plugin-gallery-generator

#!/usr/bin/env python3

"""

The plugin gallery generator retrieves Munin plugins from configured sources.

The plugins are parsed and relevant meta data is extracted (programming language, capabilities,
graph categories).

The plugin data is used for generating a static website via the static website generator "hugo".

The default configuration supplied with this generator (see "config.yml")
is used for the Munin Plugin Gallery (https://gallery.munin-monitoring.org/).


Copyright 2020, Lars Kruse <devel@sumpfralle.de>


SPDX-License-Identifier: GPL-3.0-or-later

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""


import argparse
import asyncio
import asyncio.subprocess
import collections
import datetime
import enum
import json
import logging
import multiprocessing
import os
import pathlib
import re
import shutil
import sys
import tempfile
import time
import urllib.request

import aiohttp
import yaml


# name of the directory (located next to a plugin file) that is scanned for example graph images
EXAMPLE_GRAPH_DIRECTORY_NAME = "example-graphs"
# this file is expected in the same directory as this script
INDEXING_IGNORE_WORDS_FILE = os.path.join(
    os.path.dirname(__file__), "indexing-ignore-words.txt"
)
# JSON document retrieved by the LicenseParser (its "licenses" item is evaluated)
SPDX_LICENSE_DATA_URL = (
    "https://raw.githubusercontent.com/spdx/license-list-data/master/json/licenses.json"
)


class RepositorySourceType(enum.Enum):
    """the kinds of plugin sources handled by MuninPluginSource"""

    # a repository retrieved via "git clone"
    GIT = "git"
    # a gzip-compressed tar archive that is downloaded and extracted via "tar"
    ARCHIVE = "archive"
    # a local directory that is used in place
    DIRECTORY = "directory"


class YamlDataDumper(yaml.Dumper):
    """YAML dumper that additionally knows how to serialize ordered dictionaries and tuples"""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        def represent_ordered_dict(dumper, data):
            # emit the items as a plain mapping
            return dumper.represent_dict(data.items())

        def represent_tuple(dumper, data):
            # tuples are emitted just like lists
            return dumper.represent_list(data)

        for data_type, representer in (
            (collections.OrderedDict, represent_ordered_dict),
            (tuple, represent_tuple),
        ):
            self.add_representer(data_type, representer)


class MuninPluginExampleGraph(
    collections.namedtuple("MuninPluginExampleGraph", "key filename")
):
    """an example graph image of a plugin

    "key" is the suffix taken from the image filename (a period name or a number).
    "filename" is the path of the image file.
    Instances are ordered by their key (see "_get_sort_key").
    """

    def _get_sort_key(self):
        """calculate a sorting weight based on the "key"

        The special keys for daily, weekly, monthly and yearly graphs are supposed to appear first.
        Numeric keys follow.
        All other keys are used as sorting keys without further processing.

        The result is a tuple consisting of an integer and a string.
        """
        # the period names need to match the names accepted for example graph filenames
        period_weights = {"day": -4, "week": -3, "month": -2, "year": -1}
        try:
            return (period_weights[self.key.lower()], "")
        except KeyError:
            pass
        try:
            return (int(self.key), "")
        except ValueError:
            pass
        return (100, self.key)

    def __lt__(self, other):
        """compare by sorting weight instead of the default tuple comparison"""
        return self._get_sort_key() < other._get_sort_key()


class MuninPluginRepositoryProcessingError(IOError):
    """any kind of error happened while processing a plugin source

    It is raised by the repository import helpers if spawning "git" or "tar" fails, if a download
    fails or if one of these processes returns a non-zero exit code.
    """


class ConfigurationImportError(ValueError):
    """any kind of data validation problem encountered while processing the configuration file"""
    # NOTE(review): the code raising this exception is not part of the reviewed section


class LicenseInformation(
    collections.namedtuple("LicenseInformation", ("name", "key", "url"))
):
    """description of a license: its name, its identifier ("key") and an optional reference URL

    The LicenseParser fills these fields from the SPDX license list ("name", "licenseId" and the
    first item of "seeAlso").
    """

    pass


class LicenseParser:
    """Determine the license of a given source code.

    The list of known licenses is downloaded from the SPDX project during initialization.
    A source code is searched for SPDX license identifiers first.  Afterwards a set of manually
    maintained keywords is tried (see MANUAL_KEYWORD_LICENSE_MAP).

    A failed download is tolerated: a warning is emitted and no license will be detected.

    TODO: add detection based on non-SPDX keywords
    """

    # pairs of a keyword (literal text) and the SPDX identifier of the license indicated by it
    MANUAL_KEYWORD_LICENSE_MAP = (
        # NOTE(review): "Public Domain" is mapped to a GPL identifier - this looks wrong, but
        # there is no obvious SPDX identifier to replace it with.  Please verify.
        ("Public Domain", "GPL-2.0-only"),
        ("GPLv2", "GPL-2.0-only"),
        ("GPL v2", "GPL-2.0-only"),
        ("GPL (v2)", "GPL-2.0-only"),
        ("GPLv2+", "GPL-2.0-or-later"),
        ("GPL v2+", "GPL-2.0-or-later"),
        ("GNU General Public License, version 2 or later", "GPL-2.0-or-later"),
        ("GNU General Public License, version 2", "GPL-2.0-only"),
        ("Free Software Foundation; version 2 only", "GPL-2.0-only"),
        ("version 2 dated June,", "GPL-2.0-only"),
        ("either version 2 of the License,", "GPL-2.0-or-later"),
        ("either version 3 of the License,", "GPL-3.0-or-later"),
        ("GPLv3", "GPL-3.0-only"),
        ("GPL v3", "GPL-3.0-only"),
        ("GPL (v3)", "GPL-3.0-only"),
        ("GPLv3+", "GPL-3.0-or-later"),
        ("GPL v3+", "GPL-3.0-or-later"),
        ("LGPL", "LGPL-2.0-only"),
        ("LGPLv3", "LGPL-3.0-only"),
        ("LGPL v3", "LGPL-3.0-only"),
        ("LGPL (v3)", "LGPL-3.0-only"),
        ("use, copy, and modify this software with or without fee, provided", "ISC"),
        # NOTE(review): the "ShareAlike" keywords below are mapped to the plain "CC-BY-*"
        # identifiers.  The "CC-BY-SA-*" identifiers may be more appropriate.  Please verify.
        ("Attribution-ShareAlike 1.0", "CC-BY-1.0"),
        ("Attribution-ShareAlike 2.0", "CC-BY-2.0"),
        ("Attribution-ShareAlike 2.5", "CC-BY-2.5"),
        ("http://creativecommons.org/licenses/by-sa/3.0/", "CC-BY-3.0"),
        ("Attribution-ShareAlike 3.0", "CC-BY-3.0"),
        ("http://creativecommons.org/licenses/by-sa/4.0/", "CC-BY-4.0"),
        ("Attribution-ShareAlike 4.0", "CC-BY-4.0"),
    )

    def __init__(self):
        """download the SPDX license list and prepare the regular expressions for all licenses"""
        licenses = {}
        for item in self._download_license_data():
            license_id = item["licenseId"]
            references = item.get("seeAlso")
            url = references[0] if references else None
            licenses[license_id] = LicenseInformation(item["name"], license_id, url)
        # every candidate is a tuple of priority, literal text and license
        candidates = []
        or_later_suffix = "-or-later"
        for license_info in licenses.values():
            key = license_info.key
            candidates.append((10, key, license_info))
            # add optional alias for "*-or-later"
            if key.endswith("+"):
                candidates.append((10, key[:-1] + or_later_suffix, license_info))
            if key.endswith(or_later_suffix):
                candidates.append(
                    (10, key[: -len(or_later_suffix)] + "+", license_info)
                )
        for keyword, license_id in self.MANUAL_KEYWORD_LICENSE_MAP:
            try:
                license_info = licenses[license_id]
            except KeyError:
                # An empty license list is the result of a failed download (a warning was
                # emitted before).  Otherwise the keyword map refers to an unknown license.
                if licenses:
                    logging.warning(
                        "Ignoring license keyword '%s': unknown SPDX license '%s'",
                        keyword,
                        license_id,
                    )
                continue
            candidates.append((5, keyword, license_info))
        # Sort by priority (higher first) and length (higher first).  The latter ensures that
        # a text is tested before any other text being a part of it (e.g. "GPLv2+" and "GPLv2").
        candidates.sort(key=lambda item: (-item[0], -len(re.escape(item[1]))))
        # the priority was only used for sorting
        self.license_regexes = tuple(
            (self._compile_token_regex(text), license_info)
            for priority, text, license_info in candidates
        )

    @staticmethod
    def _compile_token_regex(text):
        """compile a regular expression matching the literal text as a separate token

        A word boundary is required only at those ends of the text, that consist of a word
        character.  A boundary following a non-word character (e.g. the trailing plus sign in
        "GPLv2+") would demand a succeeding word character.  This is never intended.
        """
        prefix = r"\b" if re.match(r"\w", text[:1]) else ""
        suffix = r"\b" if re.match(r"\w", text[-1:]) else ""
        return re.compile(prefix + re.escape(text) + suffix)

    @staticmethod
    def _download_license_data():
        """retrieve the list of license descriptions published by the SPDX project

        An empty list is returned (and a warning is emitted) if the download fails or if the
        data cannot be processed.
        """
        try:
            with urllib.request.urlopen(SPDX_LICENSE_DATA_URL) as download:
                raw = download.read()
        except IOError as exc:
            logging.warning(
                "Failed to download SPDX license data (%s): %s",
                SPDX_LICENSE_DATA_URL,
                exc,
            )
            return []
        try:
            data = json.loads(raw)
        except ValueError as exc:
            logging.warning(
                "Failed to parse SPDX license data (%s): %s", SPDX_LICENSE_DATA_URL, exc
            )
            return []
        try:
            return data["licenses"]
        except (KeyError, TypeError):
            logging.warning(
                "Invalid SPDX license data format (%s): missing key 'licenses'",
                SPDX_LICENSE_DATA_URL,
            )
            return []

    def parse_code(self, code):
        """return the first license (LicenseInformation) indicated by the code or None"""
        # join lines and remove line comment indicators
        processed = " ".join(re.sub(r"^#\s*", "", line) for line in code.splitlines())
        for regex, license_info in self.license_regexes:
            if regex.search(processed):
                return license_info
        return None


class MuninPlugin:

    # special periods (day, week, month, year) and numbers are supported
    # NOTE(review): the dot in front of "png" is not escaped - thus it matches any character
    EXAMPLE_GRAPH_SUFFIX_REGEX = r"-(day|week|month|year|\d+).png"
    # the "stable-2.0" branch of the core repository uses a ".in" suffix for all plugin files
    OPTIONAL_PLUGIN_FILENAME_SUFFIXES = (".in",)
    # magic markers ("#%# family=..." and "#%# capabilities=...") embedded in the plugin code
    FAMILY_REGEX = re.compile(r"^.*#%#\s*family\s*=\s*(.+)$")
    CAPABILITIES_HEADER_REGEX = re.compile(r"^.*#%#\s*capabilities\s*=\s*(.+)$")
    # the following words are just good indicators of capabilities- not a real proof
    CAPABILITIES_INDICATOR_REGEXES = {
        "multigraph": re.compile(
            r"\b(need_multigraph|multigraph)\b", flags=re.IGNORECASE
        ),
        "dirtyconfig": re.compile(r"\bMUNIN_CAP_DIRTYCONFIG\b"),
    }
    # Most plugins contain a description in the first few lines ("NAME - SUMMARY ...").
    # Some irrelevant tokens (e.g. the prefix "Munin Plugin to" or a trailing dot) are ignored.
    SUMMARY_REGEX = re.compile(
        r"^[\w\-\\]+\s+-\s+(Munin )?((Plugin|Script) )?(to )?(?P<summary>.*?)\.?$",
        flags=re.IGNORECASE,
    )
    # a category candidate is discarded, if its line is matched by any of these expressions
    CATEGORY_LINE_BLACKLIST_REGEXES = (
        re.compile(r"(?:label|documentation|\bthe\b|filterwarnings)"),
        # ignore existing ambiguous word combinations
        re.compile(
            r"(?:env\.category|/category/|category queries|category\.|force_category)"
        ),
        # ignore SQL expressions
        re.compile(r"select.*from.*(?:join|where)"),
        # ignore any kind of comments
        re.compile(r"^\s*(?:#|//|/\*)"),
        # no variable may be part of the category name
        re.compile(r"category.*[&\$]"),
    )
    # capture the full line and the word following the term "category"
    CATEGORY_REGEX = re.compile(
        r"^(?P<line>.*[^$.]category[^\w\n]+(?P<category>\w+).*)$", flags=re.MULTILINE
    )
    # these expressions are removed from every component of the relative path of a plugin
    KEYWORDS_REMOVAL_REGEXES = (
        # the munin repository groups plugins by operating system
        re.compile(r"^node\.d\."),
        # omit the platform-independent plugin directory name used in the munin repository
        re.compile(r"^node\.d$"),
        # remove the "current directory"
        re.compile(r"^\.$"),
    )
    # http://guide.munin-monitoring.org/en/latest/reference/graph-category.html#well-known-categories
    WELL_KNOWN_CATEGORIES = {
        "1sec",
        "antivirus",
        "appserver",
        "auth",
        "backup",
        "chat",
        "cloud",
        "cms",
        "cpu",
        "db",
        "devel",
        "disk",
        "dns",
        "filetransfer",
        "forum",
        "fs",
        "fw",
        "games",
        "htc",
        "loadbalancer",
        "mail",
        "mailinglist",
        "memory",
        "munin",
        "network",
        "other",
        "printing",
        "processes",
        "radio",
        "san",
        "search",
        "security",
        "sensors",
        "spamfilter",
        "streaming",
        "system",
        "time",
        "tv",
        "virtualization",
        "voip",
        "webserver",
        "wiki",
        "wireless",
    }
    # The expressions are applied to the first line of the plugin code (the shebang).  They are
    # tested in the order of their definition: the first matching expression wins.
    IMPLEMENTATION_LANGUAGE_REGEXES = {
        "awk": re.compile(r"\W(g|m)?awk(\W|$)"),
        "bash": re.compile(r"\Wbash(\W|$)"),
        "ksh": re.compile(r"\Wksh(\W|$)"),
        "perl": re.compile(r"\Wperl(\W|$)"),
        "php": re.compile(r"\Wphp"),
        # an interpreter named "python" (without a version) is treated as python2
        "python2": re.compile(r"\Wpython2?(\W|$)"),
        "python3": re.compile(r"\Wpython3"),
        "ruby": re.compile(r"\Wj?ruby"),
        "sh": re.compile(r"\Wsh(\W|$)"),
        "zsh": re.compile(r"\Wzsh(\W|$)"),
    }
    # markdown headings (lines starting with "#") within the generated documentation
    HEADING_REGEX = re.compile(r"^(#+.*)$", flags=re.MULTILINE)
    # words within headings, that are always written in upper case or in lower case
    CAPITALIZATION_UPPER = {"IP", "TCP", "UDP"}
    CAPITALIZATION_LOWER = {"a", "the", "in", "for", "to", "and"}
    # placeholders used instead of an interpreter path (see "_preprocess_raw_code")
    PREPROCESSING_SHEBANG_SUBSTITUTIONS = (
        (re.compile(r"^#!@@BASH@@"), "#!/bin/bash"),
        (re.compile(r"^#!@@GOODSH@@"), "#!/bin/sh"),
        (re.compile(r"^#!@@PERL@@"), "#!/usr/bin/perl"),
        (re.compile(r"^#!@@PYTHON@@"), "#!/usr/bin/env python3"),
        (re.compile(r"^#!@@RUBY@@"), "#!/usr/bin/ruby"),
    )
    # a copyright or author statement within a single line: the name is captured
    COPYRIGHT_REGEX = re.compile(
        r"^.{0,15}(?:Copyright|Copyleft|copying|\(c\)|Author:)(?:\s+\(c\))?(?:\s+\d+(?:-\d+)?,?)?"
        r"(?:\s+-)?\s+((?:\w[\w.-]*\s){0,2}\w[\w.-]*?)(?:,.*|\s+[\d\-,<\(].*|\.|)$",
        flags=re.IGNORECASE,
    )
    # start and end of a documentation section containing author names
    AUTHOR_HEADING_START_REGEX = re.compile(
        r"^=head1 (AUTHORS?|COPYRIGHT)$", flags=re.IGNORECASE
    )
    AUTHOR_HEADING_END_REGEX = re.compile(r"^=(head|cut)", flags=re.IGNORECASE)
    # we expect up to three name components - anything else is probably a textual description
    AUTHOR_BARE_NAME_REGEX = re.compile(
        r"^\s*"
        r"(?:(?:.*:|(?:Copyright|Copyleft|copying)(?:\s+\(c\))?|\(c\)|\(\d+\)|contributed by)\s+)?"
        r"(?:\d+(?:-\d+)?,?\s+)?(?:-\s+)?"
        r"((?:\w[\w.-]*\s){0,2}\w[\w.-]*?)"
        r"(?:\s+[\d\-,<\(].*|\.|)?$",
        flags=re.IGNORECASE,
    )

    def __init__(
        self,
        plugin_filename,
        repository_source=None,
        name=None,
        language=None,
        license_parser=None,
    ):
        """store the given properties and look for example graphs

        The name of the plugin defaults to the basename of the plugin file.  Optional filename
        suffixes (e.g. ".in") are removed from the name.
        The content of the plugin is not processed before "initialize" is called.
        """
        plugin_name = name
        if plugin_name is None:
            plugin_name = os.path.basename(plugin_filename)
        for optional_suffix in self.OPTIONAL_PLUGIN_FILENAME_SUFFIXES:
            if plugin_name.endswith(optional_suffix):
                plugin_name = plugin_name[: -len(optional_suffix)]
        self.plugin_filename = plugin_filename
        self.repository_source = repository_source
        self.implementation_language = language
        self.name = plugin_name
        # the lookup of the images relies on the filename and the name of the plugin
        self.example_graphs = self._find_images()
        self.license_parser = license_parser
        self._is_initialized = False

    def _find_images(self):
        """return the sorted list of example graphs (MuninPluginExampleGraph) of the plugin

        The images are expected in the example graph directory located next to the plugin file.
        Their filenames consist of the name of the plugin and a suffix (period or number).
        A missing or unreadable directory results in an empty list.
        """
        example_graph_directory = os.path.join(
            os.path.dirname(self.plugin_filename), EXAMPLE_GRAPH_DIRECTORY_NAME
        )
        # The name of the plugin is a literal text: it may contain characters with a special
        # meaning in regular expressions (e.g. a dot).
        example_graph_filename_pattern = re.compile(
            re.escape(self.name) + self.EXAMPLE_GRAPH_SUFFIX_REGEX
        )
        try:
            graph_filenames = os.listdir(example_graph_directory)
        except OSError:
            return []
        example_graphs = []
        for graph_filename in graph_filenames:
            # the whole filename needs to match: files with a trailing garbage are ignored
            match = example_graph_filename_pattern.fullmatch(graph_filename)
            if match:
                example_graphs.append(
                    MuninPluginExampleGraph(
                        match.group(1),
                        os.path.join(example_graph_directory, graph_filename),
                    )
                )
        example_graphs.sort()
        return example_graphs

    async def initialize(self):
        """read the plugin file and extract all meta data from it

        The results are stored as attributes of the plugin (e.g. "documentation", "family",
        "capabilities", "categories", "summary", "authors" and "license").
        The "license" is None, if no license parser was supplied.
        Repeated calls do not process the file again.
        """
        if self._is_initialized:
            return
        with open(self.plugin_filename, "rb") as raw:
            # undecodable bytes are dropped silently
            raw_content = raw.read().decode(errors="ignore")
        self.plugin_code = self._preprocess_raw_code(raw_content)
        self.documentation = await self._parse_documentation()
        self.family = self._parse_family()
        self.capabilities = self._parse_capabilities()
        self.categories = self._parse_categories()
        if self.repository_source:
            self.changed_timestamp = await self.repository_source.get_file_timestamp(
                self.plugin_filename
            )
        else:
            self.changed_timestamp = None
        self.path_keywords = tuple(self._get_keywords())
        self.summary = self._guess_summary()
        self.authors = await self._guess_authors()
        # the license parser is optional
        if self.license_parser is None:
            self.license = None
        else:
            self.license = self.license_parser.parse_code(self.plugin_code)
        # an explicitly specified language is not overridden
        if self.implementation_language is None:
            self.implementation_language = self._parse_implementation_language()
        self._is_initialized = True

    @classmethod
    def _preprocess_raw_code(cls, raw_code):
        """return the code with all shebang placeholders (e.g. "#!@@PERL@@") being substituted

        These placeholders are used by the plugins of munin-2.0.
        """
        code = raw_code
        for placeholder_regex, shebang in cls.PREPROCESSING_SHEBANG_SUBSTITUTIONS:
            code = placeholder_regex.sub(shebang, code)
        return code

    def _get_keywords(self):
        """generate keywords based on the directory of the plugin (relative to its source)

        Every component of the lower-cased path is a keyword.  Irrelevant components (or parts
        of them) are removed.  Nothing is generated for a plugin without a repository source.
        """
        source = self.repository_source
        if source:
            plugin_directory = os.path.dirname(self.plugin_filename)
            relative_path = source.get_relative_path(plugin_directory)
        else:
            relative_path = ""
        for component in relative_path.lower().split(os.path.sep):
            keyword = component
            for removal_regex in self.KEYWORDS_REMOVAL_REGEXES:
                keyword = removal_regex.sub("", keyword)
            if not keyword:
                # nothing remained after the removal
                continue
            yield keyword

    def _guess_summary(self):
        """return the summary found in the documentation ("NAME - SUMMARY") or None"""
        if not self.documentation:
            return None
        # we expect the summary within the first few lines of the documentation
        candidates = self.documentation.splitlines()[:10]
        matches = (self.SUMMARY_REGEX.search(line) for line in candidates)
        return next((match.group("summary") for match in matches if match), None)

    async def _guess_authors(self):
        """return the list of author names found in the plugin code

        Two sources are evaluated: copyright (or author) statements in any line and the lines
        of a documentation section named "AUTHOR", "AUTHORS" or "COPYRIGHT".
        The names are returned in the order of their first appearance (without duplicates).
        """
        misleading_tokens = {"HOLDERS BE LIABLE FOR ANY CLAIM", "LICENSE", "Copyright"}
        # a "name" containing one of these terms is probably a textual description
        suspicious_terms = (
            "known",
            "contribut",
            "bears no resemblance",
            "license",
            "all rights reserved",
            "left join",
            "linpro as",
            "_",
        )

        def split_and_maybe_add(result, text):
            """split the text into multiple authors and add new authors to the list"""
            pieces = [text]
            for separator in (",", "/", " and "):
                pieces = [part for piece in pieces for part in piece.split(separator)]
            for piece in pieces:
                # remove "year" from copyright statements
                candidate = re.sub(r"\b\d[\d-]+\b", "", piece).strip().strip(".")
                for ignore_suffix in [" and others", ", changed by me"]:
                    if candidate.endswith(ignore_suffix):
                        candidate = candidate[: -len(ignore_suffix)]
                if candidate in misleading_tokens:
                    # ignore misleading terms following the word "COPYRIGHT"
                    continue
                if not candidate:
                    # ignore empty strings
                    continue
                if candidate[0].islower():
                    # terms starting with a lowercase letter are probably not names
                    continue
                if candidate in result:
                    # do not add duplicates
                    continue
                result.append(candidate)

        authors = []
        inside_author_section = False
        for line in self.plugin_code.splitlines():
            copyright_match = self.COPYRIGHT_REGEX.search(line)
            if copyright_match:
                split_and_maybe_add(authors, copyright_match.group(1).strip())
                continue
            if not inside_author_section:
                if self.AUTHOR_HEADING_START_REGEX.search(line):
                    inside_author_section = True
                continue
            if self.AUTHOR_HEADING_END_REGEX.search(line):
                inside_author_section = False
                continue
            name_match = self.AUTHOR_BARE_NAME_REGEX.search(line)
            if not name_match:
                continue
            author_name = name_match.group(1).strip()
            if author_name in authors:
                continue
            lowercase_name = author_name.lower()
            if any(term in lowercase_name for term in suspicious_terms):
                continue
            split_and_maybe_add(authors, author_name)
        return authors

    @classmethod
    def _rewrite_match_capitalization(cls, match):
        """return the first group of the match with a moderate capitalization of its words

        Well-known abbreviations (e.g. "IP") are written in upper case, trivial words (e.g. "the")
        in lower case and all other words in title case.  The words are joined with single spaces.
        """

        def adjust(word):
            if word.upper() in cls.CAPITALIZATION_UPPER:
                return word.upper()
            if word.lower() in cls.CAPITALIZATION_LOWER:
                return word.lower()
            return word.title()

        return " ".join(adjust(word) for word in match.group(1).split())

    async def _parse_documentation(self):
        """convert the embedded documentation (perlpod) to markdown via "pod2markdown"

        None is returned, if the plugin contains no documentation or if the conversion fails.
        """
        # files without documentation are skipped quickly (without running "pod2markdown")
        if "=head1" not in self.plugin_code:
            return None
        pod_source = self.plugin_code
        if "ruby" in pod_source[:20]:
            # Ruby's multiline comments are enclosed in "=begin" and "=end".  The former is
            # invalid perlpod syntax (the format is missing) and the latter is not recognized as
            # the end of the documentation: the markdown output would contain the plugin code.
            # Thus both are replaced with the corresponding generic perlpod markers.
            ruby_marker_replacements = {"=begin": "=pod", "=end": "=cut"}
            pod_source = os.linesep.join(
                ruby_marker_replacements.get(line, line)
                for line in pod_source.splitlines()
            )
        if "=encoding" not in pod_source:
            # Enforce utf8 input encoding (otherwise non-utf8 characters cause complaints).
            encoding_header = os.linesep.join(("=encoding utf8", "", "=head1"))
            pod_source = re.sub(
                r"^=head1", encoding_header, pod_source, count=1, flags=re.MULTILINE
            )
        converter_command = ("pod2markdown", "--utf8")
        try:
            process = await asyncio.subprocess.create_subprocess_exec(
                *converter_command,
                stdin=asyncio.subprocess.PIPE,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
        except OSError as exc:
            logging.warning("Failed to execute pod2markdown: {}".format(exc))
            return None
        markdown_raw, error_raw = await process.communicate(pod_source.encode())
        if process.returncode != 0:
            logging.info(
                "Failed to generate documentation for plugin '%s': %s",
                self.name,
                error_raw.decode(),
            )
            return None
        # remove empty lines and whitespace at the beginning and at the end
        markdown = markdown_raw.decode().strip()
        # fix all-uppercase style (e.g. "NAME" -> "Name")
        markdown = self.HEADING_REGEX.sub(self._rewrite_match_capitalization, markdown)
        # reduce the level of all headings (the template applies level 1 to the plugin title)
        # TODO: add some post-processing
        return re.sub(r"^#", "##", markdown, flags=re.MULTILINE)

    def _parse_capabilities(self):
        """return the sorted tuple of capabilities of the plugin

        The capabilities are taken from the magic marker ("#%# capabilities=...").  Additionally
        some keywords within the code are used as indicators for further capabilities.
        Plugins with a trailing underscore in their filename are marked as "wildcard".
        """
        result = set()
        for line in self.plugin_code.splitlines():
            match = self.CAPABILITIES_HEADER_REGEX.search(line)
            if match:
                # only the first marker is used
                result.update(match.group(1).strip().lower().split())
                break
        for capability, regex in self.CAPABILITIES_INDICATOR_REGEXES.items():
            if regex.search(self.plugin_code):
                result.add(capability)
        # The "wildcard" configuration ability is not really a capability. But this is the least
        # unsuitable place to indicate this behaviour
        # An optional filename suffix (e.g. "if_.in") must not hide the trailing underscore.
        basename = os.path.basename(self.plugin_filename)
        for suffix in self.OPTIONAL_PLUGIN_FILENAME_SUFFIXES:
            if basename.endswith(suffix):
                basename = basename[: -len(suffix)]
        if basename.endswith("_"):
            result.add("wildcard")
        return tuple(sorted(result))

    def _parse_family(self):
        """return the (lower-cased) family given by the first "#%# family=..." marker or None"""
        matches = (
            self.FAMILY_REGEX.search(line) for line in self.plugin_code.splitlines()
        )
        first_match = next((match for match in matches if match), None)
        if first_match is None:
            return None
        return first_match.group(1).strip().lower()

    def _parse_categories(self):
        """return the sorted tuple of (lower-cased) graph categories mentioned in the code

        Lines matching one of the blacklist expressions (e.g. comments) are ignored.
        """

        def is_acceptable(line):
            if len(line.splitlines()) != 1:
                return False
            return not any(
                blacklist_regex.search(line)
                for blacklist_regex in self.CATEGORY_LINE_BLACKLIST_REGEXES
            )

        found = {
            category.lower()
            for line, category in self.CATEGORY_REGEX.findall(self.plugin_code)
            if is_acceptable(line)
        }
        return tuple(sorted(found))

    def _parse_implementation_language(self):
        """return the name of the language indicated by the first line (the shebang) or None

        The expressions are tested in the order of their definition.  None is returned for an
        empty plugin file and for an unknown interpreter.
        """
        lines = self.plugin_code.splitlines()
        if not lines:
            # an empty file does not contain a shebang
            return None
        first_line = lines[0]
        for name, regex in self.IMPLEMENTATION_LANGUAGE_REGEXES.items():
            if regex.search(first_line):
                return name
        return None

    def get_unexpected_categories(self):
        """return the sorted list of categories of the plugin that are not well-known"""
        unexpected = {
            category
            for category in self.categories
            if category not in self.WELL_KNOWN_CATEGORIES
        }
        return sorted(unexpected)

    def get_details(self):
        """return a dictionary describing the plugin

        The plugin needs to be initialized before (see "initialize").
        The item "image_filenames" maps the key of every example graph to its filename.
        """
        return {
            "documentation": bool(self.documentation),
            "family": self.family,
            "capabilities": self.capabilities,
            "categories": set(self.categories),
            "keywords": set(self.path_keywords),
            "unexpected_categories": self.get_unexpected_categories(),
            "authors": tuple(self.authors),
            # every example graph is a pair of key and filename
            "image_filenames": dict(self.example_graphs),
            "changed_timestamp": self.changed_timestamp,
        }

    def __str__(self):
        """describe the plugin by its name and the number of its example graphs (if any)"""
        if self.example_graphs:
            return "Plugin '{:s}' ({:d} example graphs)".format(
                self.name, len(self.example_graphs)
            )
        else:
            return "Plugin '{:s}'".format(self.name)


class MuninPluginSource:
    def __init__(
        self,
        name,
        source_type,
        location,
        git_branch=None,
        source_path=None,
        ignore_files=None,
    ):
        """store the description of a plugin source

        Nothing is downloaded before "initialize" is called.
        The source path defaults to the current directory (".").
        """
        if not source_path:
            source_path = os.path.curdir
        if not ignore_files:
            ignore_files = []
        self.name = name
        self._source_type = source_type
        self._source_location = location
        self._branch = git_branch
        self._filter_path = source_path
        self._ignore_files = set(ignore_files)
        self._is_downloaded = False

    async def initialize(self):
        """retrieve the content of the source and determine the directory containing the plugins

        A temporary directory is created as the target of downloads.  It is removed again, if
        the retrieval fails.  Repeated calls do not retrieve the source again.

        Raises ValueError for an unknown source type.  Errors raised by the import helpers are
        passed through.
        """
        if self._is_downloaded:
            return
        self._extract_directory = tempfile.mkdtemp(prefix="munin-gallery-")
        try:
            if self._source_type == RepositorySourceType.GIT:
                self._plugins_directory = await self._import_git_repository(
                    self._extract_directory,
                    self._source_location,
                    self._branch,
                    path=self._filter_path,
                )
            elif self._source_type == RepositorySourceType.ARCHIVE:
                self._plugins_directory = await self._import_archive(
                    self._extract_directory,
                    self._source_location,
                    path=self._filter_path,
                )
            elif self._source_type == RepositorySourceType.DIRECTORY:
                # local directories are used in place
                self._plugins_directory = os.path.join(
                    self._source_location, self._filter_path
                )
            else:
                raise ValueError("Invalid source type: {}".format(self._source_type))
        except BaseException:
            # The removal in "__del__" is limited to successfully retrieved sources.  Thus the
            # directory needs to be removed here (even in case of a cancelled task).
            shutil.rmtree(self._extract_directory, ignore_errors=True)
            raise
        self._is_downloaded = True

    def __del__(self):
        """remove the temporary directory of a retrieved source"""
        # the attribute is missing, if the constructor did not finish
        if getattr(self, "_is_downloaded", False):
            shutil.rmtree(self._extract_directory, ignore_errors=True)
            del self._extract_directory
            del self._plugins_directory
            self._is_downloaded = False

    async def _get_git_file_timestamp(self, filename):
        """return the author date (datetime) of the most recent non-merge commit of the file

        "git log" is executed in the directory of the file.
        None is returned (and a warning is emitted), if the timestamp cannot be determined.
        """
        directory, basename = os.path.split(filename)
        git_command = [
            "git",
            "log",
            "-n",
            "1",
            "--no-merges",
            "--format=format:%aI",
            "--",
            basename,
        ]
        try:
            process = await asyncio.subprocess.create_subprocess_exec(
                *git_command,
                cwd=directory,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
        except OSError:
            message = "Failed to run 'git log' while retrieving the timestamp of '{}'."
            logging.warning(message.format(filename))
            return None
        timestamp_raw, error_output = await process.communicate()
        if process.returncode != 0:
            message = "Failed to retrieve the timestamp of '{}' via 'git log': {}"
            logging.warning(message.format(filename, error_output.decode()))
            return None
        try:
            timestamp = datetime.datetime.fromisoformat(timestamp_raw.decode())
        except ValueError:
            message = "Failed to parse file timestamp of '{}': {}"
            logging.warning(message.format(filename, timestamp_raw))
            return None
        return timestamp

    async def get_file_timestamp(self, filename):
        """return the timestamp (timezone-aware datetime) of the latest change of a file or None

        The source of the timestamp depends on the type of the plugin source:
        the most recent commit (git) or the modification time of the file (directory).
        No timestamps are available for archives.

        Raises ValueError for an unknown source type.
        """
        if self._source_type == RepositorySourceType.GIT:
            return await self._get_git_file_timestamp(filename)
        elif self._source_type == RepositorySourceType.ARCHIVE:
            # github's tar archive does not support proper file timestamps
            return None
        elif self._source_type == RepositorySourceType.DIRECTORY:
            stat = os.stat(filename, follow_symlinks=True)
            # Use the time of the latest modification of the content.  "st_ctime" would be
            # affected by metadata changes, too (on Windows it is the time of creation).
            return datetime.datetime.fromtimestamp(stat.st_mtime).astimezone()
        else:
            raise ValueError("Invalid source type: {}".format(self._source_type))

    @staticmethod
    async def _import_git_repository(
        target_directory, repository_url, branch, path=None
    ):
        """clone a git repository and return the directory containing the plugins

        The full history of the branch is retrieved, since it is required for the timestamps of
        the files.  The default branch of the repository is used, if no branch is specified.
        The optional path is the location of the plugins within the repository.

        Raises MuninPluginRepositoryProcessingError, if "git" cannot be executed or if it fails.
        """
        if path is not None and path.rstrip(os.path.sep) == os.path.curdir:
            path = None
        # we cannot use "--depth=1", since we are interested in the file timestamps
        clone_command = ["git", "clone", "--single-branch"]
        if branch:
            clone_command.extend(["--branch", branch])
        clone_command.extend([repository_url, target_directory])
        try:
            process = await asyncio.subprocess.create_subprocess_exec(
                *clone_command,
                stderr=asyncio.subprocess.PIPE,
            )
        except OSError as exc:
            raise MuninPluginRepositoryProcessingError(
                "Failed to spawn process for repository retrieval (git): {}".format(exc)
            ) from exc
        # The error output is consumed before waiting for the process.  Otherwise the process
        # could be blocked by a filled pipe.
        error_output = await process.stderr.read()
        await process.wait()
        if process.returncode != 0:
            raise MuninPluginRepositoryProcessingError(
                "Failed to clone repository ({}): {}".format(
                    repository_url, error_output.decode(errors="replace")
                )
            )
        return os.path.join(target_directory, path) if path else target_directory

    @staticmethod
    async def _import_archive(target_directory, archive_url, path=None):
        """download a gzip-compressed tar archive and extract it into the target directory

        The downloaded data is piped into "tar".  The top-level directory of the archive is
        stripped.  The optional path limits the extraction to a part of the archive.
        The directory containing the plugins is returned.

        Raises MuninPluginRepositoryProcessingError, if "tar" cannot be executed, if the download
        fails or if the extraction fails.
        """
        if path is not None and path.rstrip(os.path.sep) == os.path.curdir:
            path = None
        # Strip the top-level path before extracting. Github assembles the name of this path
        # component by combining the repository name and the branch name.
        extract_command = [
            "tar",
            "--extract",
            "--gzip",
            "--strip-components=1",
            "--directory",
            target_directory,
        ]
        if path:
            # extract the specified path and ignore the top-level directory of github's archive
            extract_command.extend(["--wildcards", os.path.join("*", path)])
        try:
            process = await asyncio.subprocess.create_subprocess_exec(
                *extract_command,
                stdin=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
        except OSError as exc:
            raise MuninPluginRepositoryProcessingError(
                "Failed to spawn process for archival extraction (tar): {}".format(exc)
            ) from exc
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(archive_url) as response:
                    # do not feed an error page into "tar"
                    response.raise_for_status()
                    while True:
                        chunk = await response.content.read(256 * 1024)
                        if not chunk:
                            break
                        try:
                            process.stdin.write(chunk)
                            # wait for "tar" instead of buffering the whole archive in memory
                            await process.stdin.drain()
                        except (BrokenPipeError, ConnectionResetError):
                            # "tar" terminated prematurely: its exit code and its error output
                            # are evaluated below
                            break
        except (IOError, aiohttp.ClientError, asyncio.TimeoutError) as exc:
            # do not leave a waiting "tar" process behind
            try:
                process.kill()
            except ProcessLookupError:
                pass
            await process.wait()
            raise MuninPluginRepositoryProcessingError(
                "Failed to download source archive from '{}': {}".format(
                    archive_url, exc
                )
            ) from exc
        process.stdin.close()
        # The error output is consumed before waiting for the process.  Otherwise the process
        # could be blocked by a filled pipe.
        error_output = await process.stderr.read()
        await process.wait()
        if process.returncode != 0:
            raise MuninPluginRepositoryProcessingError(
                "Failed to extract source archive ({}): {}".format(
                    archive_url, error_output.decode(errors="replace")
                )
            )
        return os.path.join(target_directory, path if path else os.path.curdir)
    async def get_plugins(self, license_parser=None):
        """Yield a MuninPlugin object for every plugin file below the plugins directory.

        The source is initialized first.  Afterwards the plugins directory is walked
        recursively.  Executable files are treated as plugins.  Non-executable files are
        accepted, if their name ends with ".in", ".c" or ".cpp" (the suffix is removed
        from the plugin name).

        Files listed in the set of ignored files, unreadable files and the content of
        specific directories (example graphs, "node.d.debug") are skipped.

        Args:
            license_parser: handed over to every created MuninPlugin
        """
        await self.initialize()
        skipped_directory_names = {EXAMPLE_GRAPH_DIRECTORY_NAME, "node.d.debug"}
        for dirpath, _dirnames, filenames in os.walk(self._plugins_directory):
            if os.path.basename(dirpath) in skipped_directory_names:
                # example graph directories are not expected to contain plugins
                continue
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                if self.get_relative_path(full_path) in self._ignore_files:
                    continue
                try:
                    status = os.stat(full_path, follow_symlinks=False)
                except OSError as exc:
                    # Without a file status we must not continue: "status" would be
                    # undefined or it would contain the details of the previous file.
                    logging.warning(
                        "Failed to read file status ({}): {}".format(full_path, exc)
                    )
                    continue
                # every executable file (owner's execute bit) is assumed to be a plugin
                if status.st_mode & 0o100 > 0:
                    yield MuninPlugin(full_path, self, license_parser=license_parser)
                elif filename.endswith(".in"):
                    # the plugin files in the stable-2.0 repository are not executable
                    yield MuninPlugin(
                        full_path,
                        repository_source=self,
                        name=filename[:-3],
                        license_parser=license_parser,
                    )
                elif filename.endswith(".c"):
                    yield MuninPlugin(
                        full_path,
                        repository_source=self,
                        name=filename[:-2],
                        language="c",
                        license_parser=license_parser,
                    )
                elif filename.endswith(".cpp"):
                    yield MuninPlugin(
                        full_path,
                        repository_source=self,
                        name=filename[:-4],
                        language="cpp",
                        license_parser=license_parser,
                    )
                else:
                    # this file is probably not a plugin
                    pass

    def get_relative_path(self, path):
        """Return the given path relative to the plugins directory (as a string)."""
        absolute_location = pathlib.Path(path)
        relative_location = absolute_location.relative_to(self._plugins_directory)
        return str(relative_location)


def _load_indexing_ignore_words(filename):
    """Read the words to be excluded from the index (one entry per line).

    Empty lines are skipped.  The file is closed before returning.
    """
    with open(filename, "r") as ignore_words_file:
        return [word for word in ignore_words_file.read().splitlines() if word]


class ContentIndexer:
    """a trivial content indexer for reducing the given text to a minimal set of words

    Specific lines, special characters and superfluous whitespace are removed.
    """

    # ignore lines with headings and magic markers
    IGNORE_LINE_REGEX = re.compile(r"(^#|^\s+#%#)")
    # The regular expressions are applied in the given order.
    REMOVAL_REGEXES = (
        # Remove the license marker.  This needs to happen before the special characters
        # (hyphens and colons) are removed: otherwise the marker cannot be found anymore.
        re.compile(r"\bSPDX-License-Identifier:"),
        # Remove common (very unspecific) words.  These words are parsed from a separate file.
        # The entries are used verbatim as alternatives of the regular expression.
        re.compile(
            r"\b({})\b".format(
                "|".join(_load_indexing_ignore_words(INDEXING_IGNORE_WORDS_FILE))
            ),
            flags=re.IGNORECASE,
        ),
        # remove single-letter words/digits and dots
        re.compile(r"\b(\.+|\w)\b", flags=re.IGNORECASE),
        # remove all special characters
        re.compile(r"[^\w\s.]"),
    )
    MERGE_WHITESPACE_REGEX = re.compile(r"\s+")

    @classmethod
    def get_indexing_content(cls, text):
        """Reduce the text to a single line of space-separated words.

        Ignored lines are dropped.  Every match of the removal expressions is replaced
        with a space.  Runs of whitespace are merged.  Empty lines are omitted.
        """
        result = []
        for line in text.splitlines():
            if cls.IGNORE_LINE_REGEX.search(line):
                continue
            for regex in cls.REMOVAL_REGEXES:
                line = regex.sub(" ", line)
            line = cls.MERGE_WHITESPACE_REGEX.sub(" ", line)
            line = line.strip()
            if line:
                result.append(line)
        return " ".join(result)


class MuninPluginsHugoExport:
    """Export plugins as content of a hugo site and run "hugo" for building or serving it.

    Every exported plugin is stored in a separate directory below the export directory.
    This directory contains a copy of the plugin source, copies of its example graphs and
    an "index.md" file (YAML front matter followed by the documentation of the plugin).
    """

    MISSING_DOCUMENTATION_TEXT = "Sadly there is no documentation for this plugin."
    PLUGINS_SUBDIRECTORY = "plugins"

    def __init__(
        self,
        hugo_directory,
        export_directory,
        baseurl="http://localhost/",
        hugo_environment="production",
        clean_export_directory=True,
    ):
        """Store the settings and (optionally) remove a previously existing export.

        Args:
            hugo_directory: working directory for all "hugo" calls
            export_directory: target directory for the exported plugins
            baseurl: value of hugo's "--baseURL" argument
            hugo_environment: value of hugo's "--environment" argument
            clean_export_directory: remove an existing export directory (recursively)
        """
        self._hugo_directory = hugo_directory
        self._baseurl = baseurl
        self._hugo_environment = hugo_environment
        self._export_directory = export_directory
        self.plugins = []
        if clean_export_directory and os.path.exists(self._export_directory):
            shutil.rmtree(self._export_directory)

    async def _run_hugo(self, action=None, hide_output=True):
        """Run "hugo" within the hugo directory and wait for its termination.

        Args:
            action: optional sub-command (e.g. "serve")
            hide_output: capture (and discard) the standard output of the process

        Returns:
            True, if the process finished with the exit code zero.  Otherwise the problem
            is logged and False is returned.
        """
        call_args = [
            "hugo",
            "--baseURL",
            self._baseurl,
            "--environment",
            self._hugo_environment,
        ]
        if action:
            call_args.append(action)
        call_kwargs = {}
        if hide_output:
            call_kwargs["stdout"] = asyncio.subprocess.PIPE
        try:
            process = await asyncio.subprocess.create_subprocess_exec(
                *call_args,
                **call_kwargs,
                cwd=self._hugo_directory,
                stderr=asyncio.subprocess.PIPE,
            )
        except OSError as exc:
            logging.error("Failed to run 'hugo': {}".format(exc))
            return False
        # Use "communicate" instead of "wait": the latter may block forever, if the child
        # fills one of the pipes while nobody is reading from it.
        _, error_output = await process.communicate()
        if process.returncode == 0:
            return True
        logging.error(
            "Failed to build hugo site: {}".format(
                error_output.decode(errors="replace")
            )
        )
        return False

    async def build(self):
        """Build the static website.  The result is True in case of success."""
        return await self._run_hugo()

    async def serve(self):
        """Run hugo's "serve" action without hiding its output."""
        return await self._run_hugo("serve", hide_output=False)

    def get_metadata(self, plugin):
        """Assemble the metadata dictionary (front matter) of a plugin.

        Only non-empty details of the plugin are added.  The names of relevant missing
        details are collected in the sorted list "missing_details".
        """
        result = {
            "title": plugin.name,
        }
        missing_details = set()
        if plugin.repository_source:
            result["repositories"] = [plugin.repository_source.name]
        if plugin.changed_timestamp:
            result["date"] = plugin.changed_timestamp.isoformat(timespec="seconds")
        if plugin.summary:
            result["summary"] = plugin.summary
        else:
            missing_details.add("summary")
        if plugin.categories:
            result["categories"] = tuple(plugin.categories)
        else:
            missing_details.add("categories")
        if plugin.family:
            result["families"] = [plugin.family]
        if plugin.capabilities:
            result["capabilities"] = tuple(plugin.capabilities)
        if plugin.path_keywords:
            result["keywords"] = tuple(sorted(plugin.path_keywords))
        if plugin.implementation_language:
            result["implementation_languages"] = [plugin.implementation_language]
        else:
            missing_details.add("implementation_language")
        if plugin.documentation:
            result["indexing_content"] = ContentIndexer.get_indexing_content(
                plugin.documentation
            )
        else:
            missing_details.add("documentation")
        if plugin.authors:
            result["authors"] = tuple(plugin.authors)
        else:
            missing_details.add("author")
        if not plugin.example_graphs:
            missing_details.add("example graphs")
        if plugin.license:
            result["licenses"] = [plugin.license.key]
        else:
            missing_details.add("license")
        result["missing_details"] = sorted(missing_details)
        return result

    @staticmethod
    def _set_timestamp_of_plugin(path, plugin):
        """Apply the change timestamp of the plugin (if available) to the given path."""
        if plugin.changed_timestamp:
            timestamp = int(plugin.changed_timestamp.timestamp())
            # the same value is used for the access time and the modification time
            os.utime(path, (timestamp, timestamp))

    def add(self, plugin):
        """Register a plugin (e.g. for the statistics)."""
        self.plugins.append(plugin)

    async def export_plugin(self, plugin):
        """Write the files of a plugin to its directory below the export directory.

        Returns:
            True in case of success.  False is returned (and a warning is emitted), if
            the plugin directory could not be created.  Failures of the following file
            operations are not handled: the exception is raised to the caller.
        """
        if plugin.repository_source:
            plugin_directory = os.path.join(
                self._export_directory, plugin.repository_source.name, plugin.name
            )
        else:
            plugin_directory = os.path.join(self._export_directory, plugin.name)
        try:
            os.makedirs(plugin_directory, exist_ok=True)
        except OSError as exc:
            logging.warning(
                "Failed to create hugo plugin directory ({}): {}".format(
                    plugin_directory, exc
                )
            )
            return False
        source_path = os.path.join(plugin_directory, "source")
        shutil.copy(plugin.plugin_filename, source_path)
        self._set_timestamp_of_plugin(source_path, plugin)
        local_graphs = []
        for graph in plugin.example_graphs:
            # the graph is stored under its key (the file extension is kept)
            destination = os.path.join(
                plugin_directory, graph.key + os.path.splitext(graph.filename)[1]
            )
            shutil.copy(graph.filename, destination)
            self._set_timestamp_of_plugin(destination, plugin)
            local_graphs.append(
                {
                    "key": graph.key,
                    "path": os.path.basename(destination),
                }
            )
        export_filename = os.path.join(plugin_directory, "index.md")
        # The file is opened in text mode: the platform specific line separator is applied
        # automatically.  Thus we need to write "\n" instead of "os.linesep".
        with open(export_filename, "w") as plugin_file:
            plugin_file.write("---\n")
            meta_data = self.get_metadata(plugin)
            if local_graphs:
                meta_data["example_graphs"] = local_graphs
            plugin_file.write(yaml.dump(meta_data, Dumper=YamlDataDumper, indent=4))
            plugin_file.write("---\n")
            if plugin.documentation:
                plugin_file.write(plugin.documentation + "\n")
            else:
                plugin_file.write(self.MISSING_DOCUMENTATION_TEXT + "\n")
                # prevent hugo from treating the plugin code as being part of the summary
                plugin_file.write("<!--more-->\n")
            show_source_format_string = """
{{< collapse title="Source Code" >}}
{{< code lang="%(language)s" file="/%(path)s" >}}
{{< /collapse >}}
"""
            plugin_file.write(
                show_source_format_string
                % {
                    "language": plugin.implementation_language or "",
                    "path": os.path.join(
                        self.PLUGINS_SUBDIRECTORY,
                        *pathlib.Path(source_path)
                        .relative_to(self._export_directory)
                        .parts,
                    ),
                }
            )
        # the timestamps are applied after all write operations are finished
        self._set_timestamp_of_plugin(export_filename, plugin)
        self._set_timestamp_of_plugin(plugin_directory, plugin)
        return True

    def get_statistics(self):
        """Count the registered plugins and the plugins lacking specific details."""
        return {
            "all": len(self.plugins),
            "missing_documentation": sum(
                1 for p in self.plugins if not p.documentation
            ),
            "missing_family": sum(1 for p in self.plugins if not p.family),
            "missing_capabilities": sum(
                1 for p in self.plugins if not p.capabilities
            ),
            "missing_summary": sum(1 for p in self.plugins if not p.summary),
            # TODO: evaluate
            "unexpected_categories": sum(
                1 for p in self.plugins if p.get_unexpected_categories()
            ),
        }


async def import_plugin_source_archive(
    plugin_source, plugin_queue, license_parser=None
):
    """Put every plugin delivered by the plugin source into the queue.

    Any failure during the retrieval is logged as an error: no exception is raised.
    Plugins queued before the failure remain in the queue.
    """
    try:
        discovered_plugins = plugin_source.get_plugins(license_parser=license_parser)
        async for discovered in discovered_plugins:
            announcement = "Adding plugin '{}'".format(discovered.name)
            logging.info(announcement)
            await plugin_queue.put(discovered)
    except Exception as error:
        source_name = plugin_source.name
        failure = "Failed to import plugin source archive ({}): {}".format(
            source_name, error
        )
        logging.error(failure)


async def worker_initialize_plugins(jobs, destination):
    """Initialize the plugins taken from "jobs" and forward them to "destination".

    This worker runs until it is cancelled.  A plugin failing to initialize is not
    forwarded: a warning and the traceback are logged instead.  Every retrieved job is
    marked as done - regardless of the outcome.
    """
    import traceback

    while True:
        candidate = await jobs.get()
        try:
            try:
                await candidate.initialize()
            except Exception as error:
                logging.warning(
                    "Failed to initialize plugin ({}): {}".format(
                        candidate.name, error
                    )
                )
                logging.warning(traceback.format_exc())
                continue
            # progress report: based on the current fill levels of both queues
            waiting = jobs.qsize()
            finished = destination.qsize()
            total = waiting + finished
            logging.info(
                "[{:d}/{:d}] Plugin '{}' finished".format(
                    finished, total, candidate.name
                )
            )
            await destination.put(candidate)
        finally:
            jobs.task_done()


async def worker_export_plugins_to_hugo(exporter, input_queue):
    """Register and export every plugin arriving in the queue.

    This worker runs until it is cancelled.  A failed export is logged as an error.  The
    plugin stays registered in the exporter and the queue item is marked as done anyway.
    """

    async def export_and_report_failure(item):
        try:
            await exporter.export_plugin(item)
        except Exception as error:
            logging.error("Failed to add plugin to exporter: {}".format(error))

    while True:
        received = await input_queue.get()
        exporter.add(received)
        await export_and_report_failure(received)
        input_queue.task_done()


async def import_plugins(plugin_sources, initialized_plugins):
    """Collect the plugins of all sources and initialize them concurrently.

    One task per plugin source feeds the discovered plugins into an internal queue.
    A set of worker tasks (one per CPU) initializes these plugins and puts them into the
    queue "initialized_plugins".  The function returns as soon as all sources are
    processed and all discovered plugins were handled by the workers.

    Args:
        plugin_sources: iterable of plugin sources
        initialized_plugins: queue receiving the successfully initialized plugins
    """
    pending_plugins = asyncio.Queue()
    license_parser = LicenseParser()
    plugin_source_workers = [
        asyncio.create_task(
            import_plugin_source_archive(
                source, pending_plugins, license_parser=license_parser
            )
        )
        for source in plugin_sources
    ]
    plugin_workers = [
        asyncio.create_task(
            worker_initialize_plugins(pending_plugins, initialized_plugins)
        )
        for _ in range(multiprocessing.cpu_count())
    ]
    try:
        await asyncio.gather(*plugin_source_workers, return_exceptions=True)
        await pending_plugins.join()
    finally:
        # The workers are endless loops: stop them explicitly instead of leaving them
        # waiting for new jobs until the event loop is closed.
        for worker in plugin_workers:
            worker.cancel()
        await asyncio.gather(*plugin_workers, return_exceptions=True)


def get_plugin_statistics(plugins):
    """Group the given plugins by the details they are lacking.

    The result is a dictionary of lists.  The list "all" contains every plugin.  The
    other lists contain the plugins matching the condition described by the key.
    """
    # the order of this table determines the order of the keys and of the evaluation
    conditions = (
        ("all", lambda item: True),
        ("missing_documentation", lambda item: not item.documentation),
        ("missing_family", lambda item: not item.family),
        ("missing_capabilities", lambda item: not item.capabilities),
        ("missing_summary", lambda item: not item.summary),
        (
            "unknown_implementation_language",
            lambda item: not item.implementation_language,
        ),
        ("unexpected_categories", lambda item: item.get_unexpected_categories()),
    )
    grouped = {label: [] for label, _ in conditions}
    for candidate in plugins:
        for label, is_matching in conditions:
            if is_matching(candidate):
                grouped[label].append(candidate)
    return grouped


async def transfer_queue_to_list(input_queue):
    """Remove all items currently stored in the queue and return them as a list."""
    collected = []
    while True:
        if input_queue.empty():
            return collected
        item = await input_queue.get()
        collected.append(item)


async def import_local_plugins(plugin_filenames):
    """Initialize the plugins stored in the given files and print their details.

    The plugins are processed one after another.  All of them share a single license
    parser.
    """
    shared_license_parser = LicenseParser()
    # the generator creates each plugin right before it is processed
    local_plugins = (
        MuninPlugin(location, license_parser=shared_license_parser)
        for location in plugin_filenames
    )
    for local_plugin in local_plugins:
        await local_plugin.initialize()
        print(local_plugin.get_details())


async def publish_plugins_hugo(
    plugin_sources,
    export,
    skip_collect=False,
    skip_website=False,
    show_statistics=False,
):
    """Collect the plugins, export them and build the website.

    Args:
        plugin_sources: the sources to be searched for plugins
        export: the exporter (it receives the plugins and builds the website)
        skip_collect: do not collect and export the plugins
        skip_website: do not build the website
        show_statistics: print the plugin statistics and the runtime of each step
            (only available, if the plugins were collected)

    Returns:
        False, if the website build failed.  Otherwise True.
    """
    timing_statistics = []
    if not skip_collect:
        start_time = time.monotonic()
        loaded_plugins = asyncio.Queue()
        worker = asyncio.create_task(
            worker_export_plugins_to_hugo(export, loaded_plugins)
        )
        try:
            await import_plugins(plugin_sources, loaded_plugins)
            # Wait for the export of all queued plugins.  Otherwise the completeness of
            # the export would depend on the scheduling order of the tasks.
            await loaded_plugins.join()
        finally:
            worker.cancel()
            # wait for the cancellation of the worker to be processed
            await asyncio.gather(worker, return_exceptions=True)
        timing_statistics.append(("Collect Plugins", time.monotonic() - start_time))
    if not skip_website:
        start_time = time.monotonic()
        if not await export.build():
            logging.error("Failed to build the static export")
            return False
        timing_statistics.append(("Build Website", time.monotonic() - start_time))
    if show_statistics and not skip_collect:
        for key, value in export.get_statistics().items():
            print("{}: {}".format(key, value))
        for label, duration in timing_statistics:
            print("Runtime ({:s}): {:d} seconds".format(label, int(duration)))
    return True


@enum.unique
class CommandLineAction(enum.Enum):
    """The actions selectable via the positional command line argument."""

    BUILD = "build"
    SERVE = "serve"


def get_arguments(argv=None):
    """Parse the command line arguments.

    The default locations of the plugin target directory and of the template directory
    are relative to the directory of this file.

    Args:
        argv: optional list of argument strings.  The arguments of the current process
            are used, if it is None.

    Returns:
        The namespace object created by the argument parser.
    """
    base_dir = os.path.dirname(__file__)
    parser = argparse.ArgumentParser(
        description=(
            "Collect munin plugins from different sources and build a static website for "
            "visualizing these plugins."
        )
    )
    parser.add_argument(
        "--target-plugin-directory",
        default=os.path.join(base_dir, "build", "plugins"),
        help="Target directory for plugin collection",
    )
    parser.add_argument(
        "--template-directory",
        default=os.path.join(base_dir, "hugo-base"),
        help="Directory of the website template (hugo)",
    )
    parser.add_argument(
        "--export-url",
        default=None,
        help="Override the base URL given in the configuration file",
    )
    parser.add_argument(
        "--skip-collect", action="store_true", help="Skip the plugin collection"
    )
    parser.add_argument(
        "--skip-website", action="store_true", help="Skip the website build process"
    )
    parser.add_argument(
        "--show-statistics",
        action="store_true",
        help="Output various statistics after the build process",
    )
    parser.add_argument(
        "--show-metadata",
        action="store_true",
        help="Output the metadata of all plugins",
    )
    # this argument may be specified multiple times
    parser.add_argument(
        "--plugin",
        dest="plugin_filenames",
        action="append",
        default=[],
        help="Process only a single plugin instead of all configured sources",
    )
    parser.add_argument(
        "--config", default="config.yml", help="Location of the configuration file"
    )
    # the action is optional (nargs="?"): the website is built by default
    parser.add_argument(
        "action",
        choices=tuple(item.value for item in CommandLineAction),
        default=CommandLineAction.BUILD.value,
        nargs="?",
        help="Action to execute",
    )
    return parser.parse_args(argv)


def load_configuration_file(filename):
    """Read and parse the configuration file (YAML).

    Args:
        filename: location of the configuration file

    Returns:
        The content of the configuration file (a dictionary).

    Raises:
        ConfigurationImportError: the file could not be read, it is not valid YAML, its
            content is not a mapping or a required key is missing
    """
    try:
        with open(filename, "r") as stream:
            content = stream.read()
    except IOError as exc:
        raise ConfigurationImportError(
            "Failed to read configuration file: {}".format(exc)
        ) from exc
    try:
        data = yaml.safe_load(content)
    except yaml.YAMLError as exc:
        raise ConfigurationImportError(
            "Failed to parse configuration file: {}".format(exc)
        ) from exc
    # an empty file is parsed as "None": the key lookup below requires a mapping
    if not isinstance(data, dict):
        raise ConfigurationImportError(
            "Configuration file does not contain a mapping: {}".format(filename)
        )
    # verify existence of required keys
    for key in ("sources",):
        if key not in data:
            raise ConfigurationImportError(
                "Missing required key '{}' in configuration file: {}".format(
                    key, filename
                )
            )
    return data


def _parse_sources_from_configuration(source_configurations):
    """Create the plugin sources described by the given list of settings.

    Every item is a dictionary with (at least) the keys "name", "type" and "location".
    The value of "type" is translated into a RepositorySourceType and handed over as
    "source_type".  All other items are used as keyword arguments of MuninPluginSource
    without modification.  The input dictionaries are not modified.

    Raises:
        ConfigurationImportError: a required key is missing or the type is unknown
    """
    plugin_sources = []
    for index, original_settings in enumerate(source_configurations):
        for required_key in ("name", "type", "location"):
            if required_key not in original_settings:
                raise ConfigurationImportError(
                    "Missing key '{}' in source #{:d}".format(required_key, index)
                )
        # work with a copy: the configuration of the caller should stay untouched
        source_settings = dict(original_settings)
        # translate some input setting names (only for undesired variable names)
        source_type = source_settings.pop("type")
        try:
            source_settings["source_type"] = RepositorySourceType(source_type)
        except ValueError as exc:
            # the lookup of an enum member by its value fails with a ValueError
            raise ConfigurationImportError(
                "invalid source 'type' specified for source #{:d}: {} (allowed types: {})".format(
                    index,
                    source_type,
                    " / ".join(item.value for item in RepositorySourceType),
                )
            ) from exc
        plugin_sources.append(MuninPluginSource(**source_settings))
    return plugin_sources


def _parse_website_settings_from_configuration(website_config, baseurl_override=None):
    result = {}
    if baseurl_override is None:
        result["baseurl"] = (
            website_config.get("baseurl", "http://localhost/").rstrip("/") + "/"
        )
    else:
        result["baseurl"] = baseurl_override
    result["hugo_environment"] = website_config.get("hugo_environment", "production")
    return result


def main():
    """Entry point: parse the arguments, load the configuration and run the action.

    The process is terminated with the exit code 2, if the configuration could not be
    imported.  The exit code 1 indicates a failure while publishing the plugins.
    """
    args = get_arguments()
    wanted_action = CommandLineAction(args.action)
    if (wanted_action == CommandLineAction.SERVE) and (args.export_url is None):
        # the local "serve" feature of hugo implies a local base URL
        baseurl_override = "http://localhost:1313/"
    else:
        baseurl_override = args.export_url
    try:
        configuration = load_configuration_file(args.config)
        plugin_sources = _parse_sources_from_configuration(
            configuration.get("sources", [])
        )
        website_settings = _parse_website_settings_from_configuration(
            configuration.get("website", {}), baseurl_override=baseurl_override
        )
    except ConfigurationImportError as exc:
        logging.error(
            "Failed to import configuration file ({}): {}".format(args.config, exc)
        )
        sys.exit(2)
    if args.plugin_filenames:
        # replace the default "plugin_sources" with a limited set based on individual filenames
        plugin_sources = []
        for plugin_filename in args.plugin_filenames:
            basename = os.path.basename(plugin_filename)
            # A filename without a directory component refers to the current directory.
            # The empty string cannot be used for listing the directory content.
            dirname = os.path.dirname(plugin_filename) or os.path.curdir
            # ignore all other files located next to the wanted plugin
            ignore_files = set(os.listdir(dirname)).difference({basename})
            custom_source = MuninPluginSource(
                "manual",
                RepositorySourceType.DIRECTORY,
                dirname,
                ignore_files=ignore_files,
            )
            plugin_sources.append(custom_source)
    if wanted_action in {CommandLineAction.BUILD, CommandLineAction.SERVE}:
        hugo_export = MuninPluginsHugoExport(
            args.template_directory,
            args.target_plugin_directory,
            baseurl=website_settings["baseurl"],
            hugo_environment=website_settings["hugo_environment"],
            # keep the previously exported plugins, if the collection is skipped
            clean_export_directory=(not args.skip_collect),
        )
        action_async = publish_plugins_hugo(
            plugin_sources,
            hugo_export,
            skip_collect=args.skip_collect,
            skip_website=args.skip_website,
            show_statistics=args.show_statistics,
        )
        if not asyncio.run(action_async):
            sys.exit(1)
        if args.show_metadata:
            # group the metadata of the exported plugins by their source
            metadata = {}
            for plugin_source in plugin_sources:
                metadata[plugin_source.name] = {
                    plugin.name: hugo_export.get_metadata(plugin)
                    for plugin in hugo_export.plugins
                    if plugin.repository_source == plugin_source
                }
            print(yaml.dump(metadata, Dumper=YamlDataDumper, indent=4, sort_keys=True))
        if wanted_action == CommandLineAction.SERVE:
            try:
                asyncio.run(hugo_export.serve())
            except KeyboardInterrupt:
                # the server is expected to be stopped by the user
                pass


# run the generator only if this file is executed as a script (not when imported)
if __name__ == "__main__":
    main()