From 98f8c946ef765111f06f6b67ffc3be1e9adec717 Mon Sep 17 00:00:00 2001 From: Lauri Ojansivu Date: Fri, 25 Sep 2020 21:57:03 +0300 Subject: [PATCH] Initial commit. --- .dir-locals.el | 10 + .gitignore | 2 + LICENSE | 26 ++ Makefile | 53 +++ README | 113 +++++ pod.cpp | 1177 ++++++++++++++++++++++++++++++++++++++++++++++++ pod.hpp | 283 ++++++++++++ 7 files changed, 1664 insertions(+) create mode 100644 .dir-locals.el create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 Makefile create mode 100644 README create mode 100644 pod.cpp create mode 100644 pod.hpp diff --git a/.dir-locals.el b/.dir-locals.el new file mode 100644 index 0000000..69b378f --- /dev/null +++ b/.dir-locals.el @@ -0,0 +1,10 @@ +;;; Directory Local Variables +;;; See Info node `(emacs) Directory Variables' for more information. + +((nil . + ((indent-tabs-mode . nil))) + (c++-mode . + ((c-file-style . "stroustrup") + (c-basic-offset . 4) + (tab-width . 4) + (show-trailing-whitespace . t)))) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..51bcec0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.a +*.so diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..f2e0ea4 --- /dev/null +++ b/LICENSE @@ -0,0 +1,26 @@ +Copyright © 2019 Marvin Gülker + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..d14f594 --- /dev/null +++ b/Makefile @@ -0,0 +1,53 @@ +# Copyright © 2019 Marvin Gülker +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +# OF THE POSSIBILITY OF SUCH DAMAGE. + +CC := cc +CFLAGS := -std=c++11 -Wall -Wextra +SHAREDCFLAGS := -shared -fPIC +DESTDIR := /usr/local + +all: libpod-cpp.a libpod-cpp.so + +clean: + rm -f libpod-cpp.a + rm -f libpod-cpp.so + +install: libpod-cpp.a libpod-cpp.so + mkdir -p $(DESTDIR)/lib + install -m 0644 $^ $(DESTDIR)/lib + +uninstall: + rm -f $(DESTDIR)/lib/libpod-cpp.a + rm -f $(DESTDIR)/lib/libpod-cpp.so + +libpod-cpp.a: pod.cpp pod.hpp + $(CC) -o $@ -c $(CFLAGS) $< + +libpod-cpp.so: pod.cpp pod.hpp + $(CC) -o $@ $(SHAREDCFLAGS) $(CFLAGS) $< + +.PHONY: all clean diff --git a/README b/README new file mode 100644 index 0000000..731a79f --- /dev/null +++ b/README @@ -0,0 +1,113 @@ + POD Parser + in C++ + +This is a parser for Perl's POD markup language (see +), entirely written in C++. + +This project is developed mainly for use for the generation of the +scripting API docs of The Secret Chronicles of Dr. M. (TSC; see +https://secretchronicles.org), but has been extracted into its own +project as it turned out to be a little more than a simple parser for +simple markup language. As a result of this history, the parser +is written to be used in the context of HTML generation. If you need +to generate something other than HTML, take a look at the ToHTML() +methods of the various PodNode subclasses and implement similar +ToYourOutput() methods yourself. + + - How to use - + +The C++ POD parser is written in C++11 and has no external +dependencies. It is recommended to insert the two files directly into +your project, but a simple Makefile is provided for building it as a +library so it is possible to run: + + $ make + +This is going to create a static and a shared library named +libpod-cpp.a and libpod-cpp.so, respectively. + +After building the C++ POD parser, include the header: + + #include "pod.hpp" + +Write a function that tells the parser how a file name looks like for +any given class or module name: + + std::string filename_cb(std::string classmodname) + { + return classmodname + ".html"; + } + +Write another function that tells the parser how an HTML A tag's @name +tag looks like if given a method name and the information about +whether it's a class or module method: + + std::string methodname_cb(bool cmethod, std::string methodname) + { + return cmethod ? methodname + "-cm" : methodname + "-im"; + } + +Then, instanciate the parser with the string to parse and start +the parsing process: + + PodParser parser("=head1 The title\n\nLorem ipsum dolor sit amet\n", + filename_cb, methodname_cb); + parser.parse(); + +It is then possible to retrieve the list of parsed tokens (which are +all subclasses of PodNode) and call the virtual PodNode::ToHTML() +method on all of them in order to transform the tokens into HTML: + + const std::vector tokens = parser.GetTokens(); + for(const PodNode* p_token: tokens) { + std::cout << p_token->ToHTML(); + } + +If this is the entire processing required, a convenience function +Pod::FormatHTML() exists that takes the token list as returned +by PodParser::GetTokens() and does the exact same thing like +the snippet above. + + std::cout << Pod::FormatHTML(parser.GetTokens()); + +Either way, this is going to dump the entire HTML for all the tokens +to the standard output. Note there is no need to manually insert +newlines between tokens. This is taken care of where necessary. + + - Limitations and Extensions - + +This parser is not entirely compliant with the POD specification. The +most significant diversion is probably that it is not valid to use any +kind of formatting code inside the link target of the L<> formatting +code, especially not the E<> code. + +That being said, the parser is good enough to process TSC's scripting +API documentation, which is its primary purpose. If not given too +complicated markup, there should be no problem. + +TSC's scripting API is an object-oriented Ruby API. Therefore, the POD +markup has been slightly extended to support typical Ruby API methods, +which manifests itself in a modified behaviour of the L<> code. As +with ordinary POD, an L<> formatting code without a bar (|) means an +internal link to another object in the same documentation. Format +L can also be used as normal. However, additionally, +it is possible to use an L<> code with one of the following two forms: + +1. L creates an internal link to the instance method + `imethod' of the class or module `Object'. +2. L creates an internal link to the class or module + method `cmethod' of the class or module `Object'. + +These two extensions can also be used with explicit link text, e.g.: + + L + +Problems with obscure markup may be reported, but depending on the +complexity of the problem might not be fixed if there is no chance +that this markup occurs in TSC's scripting API documentation. + + - Legal Information - + +This parser has been written by Marvin Gülker, Unna (Germany, EU). +It is licensed under the 2-clause BSD license. The exact license text +can be found in the file LICENSE. diff --git a/pod.cpp b/pod.cpp new file mode 100644 index 0000000..05bc513 --- /dev/null +++ b/pod.cpp @@ -0,0 +1,1177 @@ +/* POD format parser written in C++. + * + * Copyright © 2019 Marvin Gülker + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "pod.hpp" +#include +#include +#include +#include +#include + +using namespace Pod; + +/** + * Creates a new parser for the POD format. `str' is the string to + * parse. `fcb' is a function pointer pointing to a callback function + * that takes as argument a class or module name as std::string and is + * required to return as an std::string the filename the class or + * module is documented in. The value will be used as-is in HTML A + * tags' HREF attributes, thus it is up to you to find out whether a + * relative or absolute filename is required. `mcb' is a callback + * that is expected to generate a string suitable for HTML A tags' + * HREF attributes given a method name as an std::string (without + * class/module information) and a boolean that indicates whether + * this is a class/module or instance method. + * + * Implementation notice: The design with callbacks allows to + * decouple the parser's logic completely from the generator's logic, + * while still taking advantage of the existing filename and + * method name id generation functions in the generator. + */ +PodParser::PodParser(const std::string& str, + std::string (*fcb)(std::string), + std::string (*mcb)(bool, std::string)) + : m_lino(0), + m_mode(mode::none), + m_link_bar_found(false), + m_source_markup(str), + m_filename_cb(fcb), + m_mname_cb(mcb), + m_verbatim_lead_space(0) +{ +} + +PodParser::~PodParser() +{ + for (PodNode* p_node: m_tokens) { + delete p_node; + } +} + +/** + * Clear internal state and remap the parser to point to `str'. + * Calling Parse() subsequently will parse `str' instead of what was + * passed to the constructor. + */ +void PodParser::Reset(const std::string& str) +{ + m_source_markup = str; + m_lino = 0; + m_tokens.clear(); + m_idx_keywords.clear(); +} + +/// Start the actual parsing operation (expensive, blocks). +void PodParser::Parse() +{ + if (m_source_markup.empty()) + return; + + std::stringstream ss(m_source_markup); + std::string line; + + m_mode = mode::none; + m_link_bar_found = false; + m_verbatim_lead_space = 0; + m_current_buffer.clear(); + m_data_end_tag.clear(); + m_ecode.clear(); + m_idx_kw.clear(); + + while (std::getline(ss, line)) { + m_lino++; + parse_line(line); // Note: `line' lacks terminal \n + } + + // Terminate whatever is the last element. The empty string + // is detected by all modes as a terminator. + parse_line(""); +} + +void PodParser::parse_line(const std::string& line) +{ + switch(m_mode) { + case mode::command: + if (line.empty()) { // Empty line terminates command paragraph + parse_command(m_current_buffer); + + m_mode = mode::none; + m_current_buffer.clear(); + } + else { + m_current_buffer += line + " "; // Replace end-of-line newline with space + } + break; + case mode::ordinary: + if (line.empty()) { // Empty line terminates ordinary paragraph + parse_ordinary(m_current_buffer); + m_mode = mode::none; + m_current_buffer.clear(); + } + else { + m_current_buffer += line + " "; // Replace end-of-line newline with space + } + break; + case mode::verbatim: + if (line.empty()) { // Empty line terminates verbatim paragraph + parse_verbatim(m_current_buffer); + + m_mode = mode::none; + m_current_buffer.clear(); + // Note: do not reset m_verbatim_lead_space here, it's required for a possible adjascent verbatim paragraph. + } + else { + m_current_buffer += line + "\n"; // Re-add newline at end of line + } + break; + case mode::data: + // Note: "data" mode can only be activated in parse_command() + if (line == m_data_end_tag) { // "=end " ends data mode + parse_data(m_current_buffer); + m_mode = mode::none; + m_current_buffer.clear(); + m_data_end_tag.clear(); + m_data_args.clear(); + } + else { + m_current_buffer += line + "\n"; // Re-add newline at end of line + } + break; + case mode::cut: + // Note: "cut" mode can only be activated in parse_command() + // Note2: While in "cut" mode everything other than "=pod" is ignored. + if (line == "=pod") // =pod ends cut mode + m_mode = mode::none; + break; + default: // No consumer mode active, check what's requested now (m_mode == mode::none) + switch (line[0]) { + case '\0': // Empty line, ignore + break; + case '=': // Command encountered + m_current_buffer = line + " "; // Replace end-of-line newline with space + m_mode = mode::command; + break; + case ' ': // fall-through + case '\t': // Verbatim encountered + // Note: Subsequent lines of verbatim don't have to be indented! + m_verbatim_lead_space = count_leading_whitespace(line); // For stripping leading spaces later on + m_current_buffer = line + "\n"; // Re-add missing end-of-line + m_mode = mode::verbatim; + break; + default: // Ordinary paragraph encountered + m_mode = mode::ordinary; + m_current_buffer = line + " "; // Replace end-of-line with space + break; + } + break; + } +} + +// Note: `ordinary' is already cleared from newlines. +void PodParser::parse_ordinary(std::string ordinary) +{ + m_tokens.push_back(new PodNodeParaStart()); + parse_inline(ordinary); + m_tokens.push_back(new PodNodeParaEnd()); +} + +// Note: `command' is already cleared from newlines. +void PodParser::parse_command(std::string command) +{ + // Parse command line into command and arguments using + // nasty magic because C++ has no "split string" function + // + std::istringstream iss(command.substr(1)); // 1 for skipping the leading "=" + std::vector arguments{std::istream_iterator{iss}, + std::istream_iterator{}}; + + std::string cmd = arguments[0]; + arguments.erase(arguments.begin()); + + // Execute the command + if (cmd == "head1") { + m_tokens.push_back(new PodNodeHeadStart(1, command.substr(cmd.length()+2))); + parse_inline(command.substr(cmd.length()+2)); + m_tokens.push_back(new PodNodeHeadEnd(1)); + } + else if (cmd == "head2") { + m_tokens.push_back(new PodNodeHeadStart(2, command.substr(cmd.length()+2))); + parse_inline(command.substr(cmd.length()+2)); + m_tokens.push_back(new PodNodeHeadEnd(2)); + } + else if (cmd == "head3") { + m_tokens.push_back(new PodNodeHeadStart(3, command.substr(cmd.length()+2))); + parse_inline(command.substr(cmd.length()+2)); + m_tokens.push_back(new PodNodeHeadEnd(3)); + } + else if (cmd == "head4") { + m_tokens.push_back(new PodNodeHeadStart(4, command.substr(cmd.length()+2))); + parse_inline(command.substr(cmd.length()+2)); + m_tokens.push_back(new PodNodeHeadEnd(4)); + } + else if (cmd == "pod") { + // This command is a no-op. It is only valid if found after a =cut command, + // which is directly handled in parse_line(). + } + else if (cmd == "cut") { + m_mode = mode::cut; + } + else if (cmd == "over") { + if (arguments.empty()) + m_tokens.push_back(new PodNodeOver()); + else + m_tokens.push_back(new PodNodeOver(std::stof(arguments[0]))); + } + else if (cmd == "item") { + // If there's a preceeding =item, close it (there's none at the beginning + // of a =over block). + PodNodeItemStart* p_preceeding_item = find_preceeding_item(); + if (p_preceeding_item) + m_tokens.push_back(new PodNodeItemEnd(p_preceeding_item->GetListType())); + + // If "=item" is not followed by *, 0-9 or [ (including not being + // followed by anything, i.e. bare), then it's a shorthand + // for "=item *". Normalise that. + if (arguments.empty()) { + arguments.push_back("*"); + } + else if (arguments[0][0] != '*' && arguments[0][0] != '[' && (arguments[0][0] < '0' || arguments[0][0] > '9')) { + arguments.insert(arguments.begin(), "*"); + } + + /* The first arguments gives the list type, any subsequent + * arguments form a paragraph inside the list. Thus, + * reconstruct the paragraph from the arguments list, parse + * it, and add it to the token list. Definition lists need + * special care as the definition term inside [] may contain + * spaces, thus the definition term spreads over multiple + * arguments. */ + if (arguments[0][0] == '[') { // Definition list + std::string dt; + for(auto iter=arguments.begin(); iter != arguments.end(); arguments.erase(iter)) { + dt += *iter; + if ((*iter).rfind(']') != std::string::npos) { + arguments.erase(iter); + break; + } + dt += " "; + } + + m_tokens.push_back(new PodNodeItemStart(dt)); + } + else { // Not a definition list + m_tokens.push_back(new PodNodeItemStart(arguments[0])); + arguments.erase(arguments.begin()); + } + + std::string para = join_vectorstr(arguments, " "); + m_tokens.push_back(new PodNodeParaStart()); + parse_inline(para); + m_tokens.push_back(new PodNodeParaEnd()); + } + else if (cmd == "back") { + OverListType list_type = OverListType::unordered; + + // If there's a preceeding =item, close it (there's none at the beginning + // of a =over block). + PodNodeItemStart* p_preceeding_item = find_preceeding_item(); + if (p_preceeding_item) { + m_tokens.push_back(new PodNodeItemEnd(p_preceeding_item->GetListType())); + list_type = p_preceeding_item->GetListType(); + + // Set the list type. The list type is set from the list's + // last item (only), but since all items need to be of the + // same time, this should rarely ever be a problem. + PodNodeOver* p_preceeding_over = find_preceeding_over(); + if (p_preceeding_over) { + p_preceeding_over->SetListType(list_type); + } + } + else { + std::cerr << "Warning on line " << m_lino << ": empty =over block" << std::endl; + } + + m_tokens.push_back(new PodNodeBack(list_type)); + } + else if (cmd == "begin") { + m_data_end_tag = std::string("=end ") + arguments[0]; + m_data_args = arguments; + m_mode = mode::data; + } // Note: "=end" is checked for in "data" mode in parse_line() + else if (cmd == "=for") { + if (arguments.empty()) { + std::cerr << "Warning on line " << m_lino << ": =for command lacks argument, ignoring" << std::endl; + return; + } + + std::string formatname = arguments[0]; + arguments.erase(arguments.begin()); + std::string content = join_vectorstr(arguments, " "); + + if (formatname[0] == ':') { // Colon means treat as normal paragraph + m_tokens.push_back(new PodNodeParaStart()); + parse_inline(content); + m_tokens.push_back(new PodNodeParaEnd()); + } + else { // Shorthand for =begin...=end + std::vector args; + args.push_back(formatname); + m_tokens.push_back(new PodNodeData(content, args)); + } + } + else if (cmd == "encoding") { + std::cerr << "Warning on line " << m_lino << ": the =encoding command is ignored, UTF-8 is assumed." << std::endl; + } + else { + std::cerr << "Warning on line " << m_lino << ": Ignoring unknown command '" << cmd << "'" << std::endl; + } +} + +void PodParser::parse_verbatim(std::string verbatim) +{ + // Strip leading white space + if (m_verbatim_lead_space > 0) { + std::stringstream ss(verbatim); + std::string line; + verbatim = ""; + while (std::getline(ss, line)) { + verbatim += line.substr(m_verbatim_lead_space) + "\n"; + } + } + + // Extend the previous verbatim node, if there is any + // (i.e. join subsequent verbatim lines). + PodNodeVerbatim* p_prev_verb = nullptr; + if (m_tokens.size() > 0) + p_prev_verb = dynamic_cast(m_tokens.back()); + if (p_prev_verb) { + p_prev_verb->AddText("\n"); + p_prev_verb->AddText(verbatim); + } + else + m_tokens.push_back(new PodNodeVerbatim(verbatim)); +} + +void PodParser::parse_data(std::string data) +{ + m_tokens.push_back(new PodNodeData(data, m_data_args)); +} + +// This function processes `para' as POD inline +// markup and returns the tokens for it. No surrounding +// elements (e.g. paragraph start and end) are included. +void PodParser::parse_inline(std::string para) +{ + struct markupel { + size_t angle_count; + mtype type; + }; + + std::stack inline_stack; + markupel mel; + for (size_t pos=0; pos < para.length(); pos++) { + if (para[pos+1] == '<') { // Start of inline markup + mel.angle_count = 0; + // Count angles + while (para[pos+1] == '<') { + mel.angle_count++; + pos++; + } + + if (is_inline_mode_active(mtype::zap)) { + std::cerr << "Warning on line " << m_lino << ": Z<> may not contain further formatting codes" << std::endl; + } + else if (is_inline_mode_active(mtype::escape)) { + std::cerr << "Warning on line " << m_lino << ": E<> may not contain further formatting codes" << std::endl; + } + else if (is_inline_mode_active(mtype::index)) { + std::cerr << "Warning on line " << m_lino << ": X<> may not contain further formatting codes" << std::endl; + } + else if (m_link_bar_found) { + std::cerr << "Warning on line " << m_lino << ": L<>'s link target may not contain formatting codes" << std::endl; + } + + mel.type = mtype::none; + switch (para[pos-mel.angle_count]) { + case 'I': + mel.type = mtype::italic; + m_tokens.push_back(new PodNodeInlineMarkupStart(mel.type)); + break; + case 'B': + mel.type = mtype::bold; + m_tokens.push_back(new PodNodeInlineMarkupStart(mel.type)); + break; + case 'C': + mel.type = mtype::code; + m_tokens.push_back(new PodNodeInlineMarkupStart(mel.type)); + break; + case 'F': + mel.type = mtype::filename; + m_tokens.push_back(new PodNodeInlineMarkupStart(mel.type)); + break; + case 'X': + mel.type = mtype::index; + m_tokens.push_back(new PodNodeInlineMarkupStart(mel.type)); + break; + case 'Z': + mel.type = mtype::zap; + m_tokens.push_back(new PodNodeInlineMarkupStart(mel.type)); + break; + case 'L': + mel.type = mtype::link; + m_tokens.push_back(new PodNodeInlineMarkupStart(mel.type)); + break; + case 'E': + mel.type = mtype::escape; + m_tokens.push_back(new PodNodeInlineMarkupStart(mel.type)); + break; + case 'S': + mel.type = mtype::nbsp; + m_tokens.push_back(new PodNodeInlineMarkupStart(mel.type)); + break; + default: + std::cerr << "Warning on line " << m_lino << ": Ignoring unknown formatting code '" << para[pos] << "'" << std::endl; + mel.type = mtype::none; + m_tokens.push_back(new PodNodeInlineMarkupStart(mel.type)); + break; + } + + // Strip leading spaces + while (para[pos+1] == ' ') + pos++; + + inline_stack.push(mel); + } + else if (inline_stack.size() > 0 && para[pos] == '>') { // End of inline markup + mel = inline_stack.top(); + std::string angles(mel.angle_count, '>'); + + // Retrieve preceeding inline text, if there's any (there's none + // immediately following an opening markup token). + PodNodeInlineText* p_prectext = dynamic_cast(m_tokens.back()); + + // Check if this is a valid markup close or just stray angle brackets + if (para.substr(pos, mel.angle_count) == angles) { // Valid + inline_stack.pop(); + pos += mel.angle_count - 1; // pos is increased by loop statement by 1 again + + // Strip trailing whitespace of preceeding text + if (p_prectext) + p_prectext->StripTrailingWhitespace(); + + // Insert End marker + switch (mel.type) { + case mtype::escape: + m_tokens.push_back(new PodNodeInlineMarkupEnd(mel.type, {m_ecode})); + m_ecode.clear(); // E<> may not nest + break; + case mtype::index: { + std::string target(m_idx_kw); + std::replace(target.begin(), target.end(), ' ', '_'); + + m_tokens.push_back(new PodNodeInlineMarkupEnd(mel.type, {target})); + m_idx_keywords[m_idx_kw] = target; + m_idx_kw.clear(); } // X<> may not nest + break; + case mtype::link: { + PodNodeInlineMarkupStart* p_lstart = find_preceeding_inline_markup_start(mtype::link); + p_lstart->AddArgument(m_link_content); + p_lstart->SetFilenameCallback(m_filename_cb); + p_lstart->SetMethodnameCallback(m_mname_cb); + + m_tokens.push_back(new PodNodeInlineMarkupEnd(mel.type)); + m_link_bar_found = false; + m_link_content.clear(); } // L<> may not nest + break; + default: + m_tokens.push_back(new PodNodeInlineMarkupEnd(mel.type)); + break; + } + } + else { // Stray angle brackets + // Not enough closing angles. Insert as plain text. + // Append to last text node if exists, otherwise + // make a new text node. + std::string s(para.substr(pos, 1)); + html_escape(s); + if (p_prectext) + p_prectext->AddText(s); + else + m_tokens.push_back(new PodNodeInlineText(s)); + + // Same as below for normal actual text + if (is_inline_mode_active(mtype::link)) { + m_link_content += para.substr(pos, 1); + } + } + } + else { // No inline markup: plain text + if (is_inline_mode_active(mtype::escape)) { // Escape code + m_ecode += para.substr(pos, 1); + } + else if (is_inline_mode_active(mtype::index)) { // Index code + m_idx_kw += para.substr(pos, 1); + } + else { // Actual text + /* L<> content handling; the parser needs the entire + * link's content later on in PodNodeInlineMarkup::ToHTML(). + * But, if a bar | is found, this terminates the link's + * visible text, separating it from the target. The following + * code makes it impossible to use | inside the link text, + * even inside another formatting code, but this is deemed + * rare enough to ignore the condition. Finally, using + * any kind of formatting markup in the link *target* is + * unsupported (this is a deviation from canonical POD markup). */ + if (is_inline_mode_active(mtype::link)) { + m_link_content += para.substr(pos, 1); + + if (para[pos] == '|') { + m_link_bar_found = true; + } + } + if (m_link_bar_found) // Visible link text has ended + continue; + + // Append to last text node if exists, otherwise + // make a new text node. + PodNodeInlineText* p_prectext = dynamic_cast(m_tokens.back()); + std::string s(para.substr(pos, 1)); + html_escape(s, is_inline_mode_active(mtype::nbsp)); + if (p_prectext) + p_prectext->AddText(s); + else + m_tokens.push_back(new PodNodeInlineText(s)); + } + } + } + + // Handle Z<> formatting codes + zap_tokens(); +} + +// Finds the preceeding =item on the same =over level. +// If there is none, returns nullptr. +PodNodeItemStart* PodParser::find_preceeding_item() { + PodNodeItemStart* p_item = nullptr; + int level = 0; + + for(auto iter=m_tokens.rbegin(); iter != m_tokens.rend(); iter++) { + if (dynamic_cast(*iter)) + level++; + else if (dynamic_cast(*iter)) { + if (level == 0) // Terminate search if enclosing =over is found + break; + else + level--; + } + else if (level == 0 && (p_item = dynamic_cast(*iter))) + return p_item; // ^ Single "=" intended + } + + return nullptr; // No preceeding =item on the same level +} + +// Finds the =over that corresponds to the current indent level. +// If there is none (i.e. currently outside =over block), +// returns nullptr. +PodNodeOver* PodParser::find_preceeding_over() { + PodNodeOver* p_over = nullptr; + int level = 0; + + for(auto iter=m_tokens.rbegin(); iter != m_tokens.rend(); iter++) { + if (dynamic_cast(*iter)) { + level++; + } + else if ((p_over = dynamic_cast(*iter))) { // Single = intended + if (level == 0) { + return p_over; + } + else { + level--; + } + } + } + + return nullptr; // Not inside an =over block +} + +// Assumes an open formatting code and finds the PodNodeInlineMarkupStart* +// that opened it, returning it. If `t' is mtype::none, any +// opening PodNodeInlineMarkupStart* suffices, otherwise the +// search is restricted to those of type `t'. +PodNodeInlineMarkupStart* PodParser::find_preceeding_inline_markup_start(mtype t) +{ + PodNodeInlineMarkupStart* p_mstart = nullptr; + int level = 0; + + for (auto iter=m_tokens.rbegin(); iter != m_tokens.rend(); iter++) { + if (dynamic_cast(*iter)) { + level++; + } + else if (level > 0 && dynamic_cast(*iter)) { + level--; + } + else if (level == 0 && (p_mstart = dynamic_cast(*iter))) { + if (t == mtype::none) + return p_mstart; + else if (p_mstart->GetMtype() == t) + return p_mstart; + } + } + + throw(std::runtime_error("Bug: Impossible condition reached: no preceeding inline markup start found")); +} + +// Checks if the parser at the current point is inside an opened +// formatting code of type `t'. This function takes care of nesting +// for all modes, even though nesting is not useful for all modes +// (notably mtype::nbsp). +bool PodParser::is_inline_mode_active(mtype t) +{ + PodNodeInlineMarkupEnd* p_mend = nullptr; + PodNodeInlineMarkupStart* p_mstart = nullptr; + int level = 0; + + for(auto iter=m_tokens.rbegin(); iter != m_tokens.rend(); iter++) { + if ((p_mend = dynamic_cast(*iter))) { // Single = intended + if (p_mend->GetMtype() == t) { + level--; + } + } + else if ((p_mstart = dynamic_cast(*iter))) { // Single = intended + if (p_mstart->GetMtype() == t) { + level++; + } + } + } + + return level > 0; +} + +// Evaluate the Z<> formatting code. This function erases from +// m_tokens everything between a PodNodeInlineMarkupStart of type +// mtype::zap and the corresponding PodNodeInlineMarkupEnd. If in a +// paragraph, heading, or item no PodNodeInlineMarkupEnd is found, the +// block's ending terminates zap mode (this caters for missing closing +// ">"). +void PodParser::zap_tokens() +{ + PodNodeInlineMarkupEnd* p_mend = nullptr; + PodNodeInlineMarkupStart* p_mstart = nullptr; + bool erase = false; + int level = 0; + + for(auto iter=m_tokens.begin(); iter != m_tokens.end(); iter++) { + // Always terminate Z<> mode if the end of the current + // block is reached while Z mode is active (i.e. missing + // closing ">"). + if ((level > 0) && (dynamic_cast(*iter) || + dynamic_cast(*iter) || + dynamic_cast(*iter))) { + level = 0; + continue; + } + + // Check for zap mode formatting codes + if ((p_mstart = dynamic_cast(*iter))) { // Single = intended + if (p_mstart->GetMtype() == mtype::zap) { + if (level > 0) { + erase = true; + } + + level++; + } + } + else if ((p_mend = dynamic_cast(*iter))) { // Single = intended + if (p_mend->GetMtype() == mtype::zap) { + level--; + + if (level > 0) { + erase = true; + } + } + } + else if (level > 0) { + erase = true; + } + + // If inside zap mode, erase token. + if (erase) { + erase = false; + iter = m_tokens.erase(iter); + if (iter == m_tokens.end()) + break; + else + iter--; + } + } +} + +/** + * Processes `title' so that it can be used for an HTML A tag's + * NAME attribute. The result is returned. + */ +std::string PodParser::MakeHeadingAnchorName(const std::string& title) +{ + std::string result; + for (size_t i=0; i < title.length(); i++) { + if (title[i] >= '0' && title[i] <= '9') + result += title[i]; + else if (title[i] >= 'A' && title[i] <= 'Z') + result += title[i]; + else if (title[i] >= 'a' && title[i] <= 'z') + result += title[i]; + else + result += '-'; + } + return result; +} + +/*************************************** + * Pod nodes + **************************************/ + +PodNodeHeadStart::PodNodeHeadStart(int level, std::string content) + : m_level(level), + m_content(content) +{ +} + +std::string PodNodeHeadStart::ToHTML() const +{ + return std::string(""); +} + +PodNodeHeadEnd::PodNodeHeadEnd(int level) + : m_level(level) +{ +} + +std::string PodNodeHeadEnd::ToHTML() const +{ + return std::string("\n"); +} + +PodNodeOver::PodNodeOver(float indent) + : m_indent(indent), + m_list_type(OverListType::unordered) +{ +} + +void PodNodeOver::SetListType(OverListType t) +{ + m_list_type = t; +} + +std::string PodNodeOver::ToHTML() const +{ + switch (m_list_type) { + case OverListType::unordered: + return "
    "; + case OverListType::ordered: + return "
      "; + case OverListType::description: + return "
      "; + } // No default -- all OverListType values are handled + + throw(std::runtime_error("This should never be reached")); +} + +/* Construct a new list item start. The list type is determined + * from the label: if it is a "*", then it's an unordered list, + * if it's a stringified number it's an ordered list, and if + * it's anything else then it's a description list. For description + * list items, the label is actually printed in the
      element on + * HTML output via ToHTML(). */ +PodNodeItemStart::PodNodeItemStart(std::string label) + : m_label(label) +{ + if (m_label[0] == '*') + m_list_type = OverListType::unordered; + else if (m_label[0] >= '0' && m_label[0] <= '9') + m_list_type = OverListType::ordered; + else + m_list_type = OverListType::description; +} + +const std::string& PodNodeItemStart::GetLabel() const +{ + return m_label; +} + +OverListType PodNodeItemStart::GetListType() const +{ + return m_list_type; +} + +std::string PodNodeItemStart::ToHTML() const +{ + switch (m_list_type) { + case OverListType::unordered: + case OverListType::ordered: // fall-through + return "
    1. "; + case OverListType::description: + return std::string("
      ") + m_label.substr(1, m_label.length() - 2) + "
      "; + } // No default -- all overListType values are handled + + throw(std::string("This should never be reached")); +} + +PodNodeItemEnd::PodNodeItemEnd(OverListType t) + : m_list_type(t) +{ +} + +std::string PodNodeItemEnd::ToHTML() const +{ + if (m_list_type == OverListType::description) + return "
      "; + else + return "
    2. "; +} + +PodNodeBack::PodNodeBack(OverListType t) + : m_list_type(t) +{ +} + +std::string PodNodeBack::ToHTML() const +{ + switch (m_list_type) { + case OverListType::unordered: + return "
\n"; + case OverListType::ordered: + return "\n"; + case OverListType::description: + return "\n"; + } // No default -- all OverListType values are handled + + throw(std::runtime_error("This should never be reached")); +} + +std::string PodNodeParaStart::ToHTML() const +{ + return "

"; +} + +std::string PodNodeParaEnd::ToHTML() const +{ + return "

\n"; +} + +PodNodeInlineText::PodNodeInlineText(std::string text) + : m_text(text) +{ +} + +PodNodeInlineText::PodNodeInlineText(char ch) + : m_text(1, ch) +{ +} + +void PodNodeInlineText::AddText(const std::string& text) { + m_text += text; +} + +void PodNodeInlineText::AddText(char ch) { + m_text += std::string(1, ch); +} + +void PodNodeInlineText::StripTrailingWhitespace() { + while (m_text[m_text.length()-1] == ' ') { + m_text = m_text.substr(0, m_text.length() - 1); + } +} + +std::string PodNodeInlineText::ToHTML() const +{ + return m_text; +} + +PodNodeInlineMarkupStart::PodNodeInlineMarkupStart(mtype type, std::initializer_list args) + : m_mtype(type), + m_args(args), + m_filename_cb(nullptr), + m_mname_cb(nullptr) +{ +} + +// If for whatever reason the PodNodeInlineMarkupStart cannot be constructed +// directly with the argument list, use this function to inject the arguments +// later on. +void PodNodeInlineMarkupStart::AddArgument(const std::string& arg) +{ + m_args.push_back(arg); +} + +// Set the filename calculation callback used for calculating L<> internal +// link targets. +void PodNodeInlineMarkupStart::SetFilenameCallback(std::string (*cb)(std::string)) +{ + m_filename_cb = cb; +} + +// Set the method name ID calculation callback used for calculating L<> internal +// link targets. +void PodNodeInlineMarkupStart::SetMethodnameCallback(std::string (*cb)(bool, std::string)) +{ + m_mname_cb = cb; +} + +std::string PodNodeInlineMarkupStart::ToHTML() const +{ + size_t pos = std::string::npos; + std::string link_target; + + switch (m_mtype) { + case mtype::none: + case mtype::nbsp: // fall-through + case mtype::zap: // fall-through + case mtype::escape: // fall-through + case mtype::index: // fall-through + return ""; + case mtype::italic: + return ""; + case mtype::bold: + return ""; + case mtype::code: + return ""; + case mtype::filename: + return ""; + case mtype::link: + if ((pos = m_args[0].find('|')) != std::string::npos) // Single = intended + link_target = m_args[0].substr(pos+1); + else // Implicit link target + link_target = m_args[0]; + + if (link_target.find('<') != std::string::npos) { + std::cerr << "Warning: Use of formatting codes inside link target '" << link_target << "' is unsupported (deviation from canonical POD syntax)" << std::endl; + } + + if (link_target.find("://") == std::string::npos) { // Target is no url + // Check if UNIX man(1) page. (= special kind of external link) + std::string manpage; + std::string section; + if (check_manpage(link_target, manpage, section)) { // It's a manpage. + return std::string(""; + } + /* It's a link to something in the docs itself (= internal link) + * There are two kind of these: + * 1. Thing/section, with /section being optional (meaning a heading) + * 2. Thing#method or Thing::method, with #method and ::method being optional + * This is an extension over canonical POD markup. + * That is, "Thing" alone is ambiguous. But as it evaluates + * to the same target (`classmodname' below), this is not + * relevant. It's processed via variant 1. */ + if (((pos = link_target.find("#")) != std::string::npos) || + ((pos = link_target.find("::")) != std::string::npos)) { // Variant 2 + bool is_cmethod = link_target[pos] == ':'; + std::string classmodname = link_target.substr(0, pos); + std::string methodname = link_target.substr(is_cmethod ? pos+2 : pos+1); + + if (classmodname.empty()) { // Link to method doc in thid document + return std::string(""; + } + else { // Link to method doc in different document + return std::string(""; + } + } + else { // Variant 1 + // Split class/module name off section link at the slash, if present. + std::string classmodname; + if ((pos = link_target.find("/")) != std::string::npos) { + classmodname = link_target.substr(0, pos); + section = link_target.substr(pos+1); + } + else + classmodname = link_target; + + if (classmodname.empty()) { // Means link to section in current document + if (section.empty()) + std::cerr << "Warning: empty link target" << std::endl; + + return std::string(""; + } + else { // Means link to different document + if (section.empty()) { + return std::string(""; + } + else { + return std::string(""; + } + } + } + } + else { // Target is url (= external link) + return std::string(""; + } + } + + throw(std::runtime_error("This should never be reached")); +} + +PodNodeInlineMarkupEnd::PodNodeInlineMarkupEnd(mtype type, std::initializer_list args) + : m_mtype(type), + m_args(args) +{ +} + +std::string PodNodeInlineMarkupEnd::ToHTML() const +{ + switch (m_mtype) { + case mtype::none: + case mtype::nbsp: // fall-through + case mtype::zap: // fall-through + return ""; + case mtype::italic: + return ""; + case mtype::bold: + return ""; + case mtype::code: + return ""; + case mtype::filename: + return ""; + case mtype::link: + return ""; + case mtype::escape: + if (m_args[0] == "verbar") + return "|"; + else if (m_args[0] == "sol") + return "/"; + else if (m_args[0] == "lchevron") + return "«"; + else if (m_args[0] == "rchevron") + return "»"; + else // FIXME: Check if args[0] is actually a valid escape code + return std::string("&") + m_args[0] + ";"; + case mtype::index: + return std::string(""; + } + + throw(std::runtime_error("This should never be reached")); +} + +PodNodeData::PodNodeData(std::string data, std::vector arguments) + : m_data(data), + m_arguments(arguments) +{ +} + +std::string PodNodeData::ToHTML() const +{ + if (m_arguments[0] == "html") + return m_data; + else + return ""; +} + +PodNodeVerbatim::PodNodeVerbatim(std::string text) + : m_text(text) +{ +} + +void PodNodeVerbatim::AddText(std::string text) +{ + m_text += text; +} + +std::string PodNodeVerbatim::ToHTML() const +{ + return std::string("
") + m_text + std::string("
\n"); +} + +/*************************************** + * Formatter + **************************************/ + +std::string Pod::FormatHTML(const std::vector& tokens) +{ + std::string result; + + for (const PodNode* p_node: tokens) { + result += p_node->ToHTML(); + } + + return result; +} + +/*************************************** + * Helpers + **************************************/ + +size_t Pod::count_leading_whitespace(const std::string& str) +{ + size_t count = 0; + while (str[count] == ' ' || str[count] == '\t') + count++; + return count; +} + +std::string Pod::join_vectorstr(const std::vector& vec, const std::string& separator) +{ + std::string result; + for(size_t i=0; i < vec.size(); i++) { + if (i > 0) + result += separator; + + result += vec[i]; + } + + return result; +} + +void Pod::html_escape(std::string& str, bool nbsp) +{ + size_t pos = 0; + while ((pos = str.find("&")) != std::string::npos && str.substr(pos, 5) != "&") + str.replace(pos, 1, "&"); + while ((pos = str.find("<")) != std::string::npos) + str.replace(pos, 1, "<"); + while ((pos = str.find(">")) != std::string::npos) + str.replace(pos, 1, ">"); + if (nbsp) + while ((pos = str.find(" ")) != std::string::npos) + str.replace(pos, 1, " "); +} + +bool Pod::check_manpage(const std::string& target, std::string& manpage, std::string& section) +{ + if ((target.find(' ') == std::string::npos) && + (target.rfind(")") == target.size() - 1) && + (target.rfind("(") == target.size() - 3) && + (target[target.size() - 2] >= '0') && + (target[target.size() - 2] <= '9')) + { + manpage = target.substr(0, target.rfind("(")); + section = target.substr(target.size() - 2, 1); + return true; + } + return false; +} diff --git a/pod.hpp b/pod.hpp new file mode 100644 index 0000000..665174a --- /dev/null +++ b/pod.hpp @@ -0,0 +1,283 @@ +/* POD format parser written in C++. + * + * Copyright © 2019 Marvin Gülker + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef POD_HPP +#include +#include +#include +#include + +#define POD_HPP +/* These classes implement the Perl POD documentation format: + * - https://perldoc.perl.org/perlpod.html + * - https://perldoc.perl.org/perlpodspec.html + */ + +namespace Pod { + +class PodNode +{ +public: + PodNode() {}; + virtual ~PodNode() {}; + virtual std::string ToHTML() const = 0; +}; + +class PodNodeHeadStart: public PodNode +{ +public: + PodNodeHeadStart(int level, std::string content); // content is for ID generation + virtual std::string ToHTML() const; +private: + int m_level; + std::string m_content; +}; + +class PodNodeHeadEnd: public PodNode +{ +public: + PodNodeHeadEnd(int level); + virtual std::string ToHTML() const; +private: + int m_level; +}; + +enum class OverListType { + unordered, + ordered, + description +}; + +class PodNodeOver: public PodNode +{ +public: + PodNodeOver(float indent = 4.0f); + virtual std::string ToHTML() const; + void SetListType(OverListType t); +private: + float m_indent; + OverListType m_list_type; +}; + +class PodNodeItemStart: public PodNode +{ +public: + PodNodeItemStart(std::string label); + virtual std::string ToHTML() const; + const std::string& GetLabel() const; + OverListType GetListType() const; +private: + std::string m_label; + OverListType m_list_type; +}; + +class PodNodeItemEnd: public PodNode +{ +public: + PodNodeItemEnd(OverListType t); + virtual std::string ToHTML() const; +private: + OverListType m_list_type; +}; + +class PodNodeBack: public PodNode +{ +public: + PodNodeBack(OverListType t); + virtual std::string ToHTML() const; +private: + OverListType m_list_type; +}; + +class PodNodeParaStart: public PodNode +{ + virtual std::string ToHTML() const; +}; + +class PodNodeParaEnd: public PodNode +{ + virtual std::string ToHTML() const; +}; + +enum class mtype { + none, + italic, + bold, + code, + filename, + nbsp, + zap, + escape, + index, + link +}; + +class PodNodeInlineMarkupStart: public PodNode +{ +public: + PodNodeInlineMarkupStart(mtype type, std::initializer_list args = {}); + virtual std::string ToHTML() const; + inline mtype GetMtype() const { return m_mtype; }; + + // These three are only used for mtype::link: + void AddArgument(const std::string& arg); + void SetFilenameCallback(std::string (*cb)(std::string)); + void SetMethodnameCallback(std::string (*cb)(bool, std::string)); +private: + mtype m_mtype; + std::vector m_args; + std::string (*m_filename_cb)(std::string); + std::string (*m_mname_cb)(bool, std::string); +}; + +class PodNodeInlineMarkupEnd: public PodNode +{ +public: + PodNodeInlineMarkupEnd(mtype type, std::initializer_list args = {}); + virtual std::string ToHTML() const; + inline mtype GetMtype() const { return m_mtype; }; +private: + mtype m_mtype; + std::vector m_args; +}; + +// This node class is for the downmost-possible unit, i.e. the actual text. +class PodNodeInlineText: public PodNode +{ +public: + PodNodeInlineText(std::string text); + PodNodeInlineText(char ch); + virtual std::string ToHTML() const; + void AddText(const std::string& text); + void AddText(char ch); + void StripTrailingWhitespace(); +private: + std::string m_text; +}; + +class PodNodeData: public PodNode +{ +public: + PodNodeData(std::string data, std::vector arguments); + virtual std::string ToHTML() const; +private: + std::string m_data; + std::vector m_arguments; +}; + +class PodNodeVerbatim: public PodNode +{ +public: + PodNodeVerbatim(std::string text); + void AddText(std::string text); + virtual std::string ToHTML() const; +private: + std::string m_text; +}; + +class PodParser +{ +public: + PodParser(const std::string& str, + std::string (*fcb)(std::string), + std::string (*mcb)(bool, std::string)); + ~PodParser(); + + void Reset(const std::string& str); + void Parse(); + inline const std::vector& GetTokens() { return m_tokens; }; + // Returns the found X<> index entries as a map of form: + // "index heading" => "insert_anchor_name" + inline const std::map GetIndexEntries() const { return m_idx_keywords; } + + static std::string MakeHeadingAnchorName(const std::string& title); +private: + void parse_line(const std::string& line); + void parse_command(std::string command); + void parse_ordinary(std::string ordinary); + void parse_verbatim(std::string verbatim); + void parse_data(std::string data); + void parse_inline(std::string para); + PodNodeItemStart* find_preceeding_item(); + PodNodeOver* find_preceeding_over(); + PodNodeInlineMarkupStart* find_preceeding_inline_markup_start(mtype t); + bool is_inline_mode_active(mtype t); + void zap_tokens(); + + enum class mode { + none, + command, + verbatim, + ordinary, + data, + cut + }; + + long m_lino; + mode m_mode; + bool m_link_bar_found; + std::string m_source_markup; + std::string (*m_filename_cb)(std::string); + std::string (*m_mname_cb)(bool, std::string); + size_t m_verbatim_lead_space; + std::vector m_tokens; + std::string m_current_buffer; + std::string m_data_end_tag; + std::vector m_data_args; + std::map m_idx_keywords; + std::string m_ecode; + std::string m_idx_kw; + std::string m_link_content; +}; + +/// A function that calls ToHTML() on each token in `tokens', +/// acculumates the results and returns them as one string. +std::string FormatHTML(const std::vector& tokens); + +// Counts the leading spaces and tabs in +str+. +size_t count_leading_whitespace(const std::string& str); +// Joins all the strings in `vec' into one string separated by `separator'. +std::string join_vectorstr(const std::vector& vec, const std::string& separator); +// Mask all occurences of &, <, and >. If `nbsp' is +// true, masks spaces as " ". +void html_escape(std::string& str, bool nbsp = false); +/* Checks if `target' is a UNIX man(1) page. Rule: If no spaces and a + * digit in parentheses and the end, it's a manpage. If `target' is + * found to be a manpage, true is returned, and `manpage' is set to + * the man page's linkable name and `section' to the section the + * manpage resides in. Otherwise, false is returned and `manpage' + * and `section' are left untouched. + * + * FIXME: Ignores manpages with unusual letter sections (e.g. 3p) */ +bool check_manpage(const std::string& target, std::string& manpage, std::string& section); + +} + +#endif /* POD_HPP */