diff --git a/.spec-data/languages.json b/.spec-data/languages.json
index a6cfa85132..df28231024 100644
--- a/.spec-data/languages.json
+++ b/.spec-data/languages.json
@@ -4,5 +4,6 @@
"es": {"name": "Spanish", "native-name": "Español"},
"fr": {"name": "French", "native-name": "Français"},
"ja": {"name": "Japanese", "native-name": "日本語"},
- "zh-cn": {"name": "Simplified Chinese", "native-name": "简体中文"}
+ "km": {"name": "Khmer", "native-name": "ភាសាខ្មែរ"},
+ "zh-cn": {"name": "Simplified Chinese", "native-name": "简体中文"}
}
diff --git a/styles/styles-html.css b/styles/styles-html.css
index 50c2bdfe04..4f83481f98 100644
--- a/styles/styles-html.css
+++ b/styles/styles-html.css
@@ -62,6 +62,14 @@ td.eg { border-width: thin; text-align: center; }
#named-character-references-table > table > tbody > tr#entity-CounterClockwiseContourIntegral > td:first-child { font-size: 0.5em; }
.glyph.control { color: red; }
+@text-decoration {
+ text-align: center;
+ font-size: 2em;
+ font-display: auto;
+ list-style: circle;
+ line-height: inherit;
+ text-decoration: line-through;
+}
@font-face {
font-family: 'Essays1743';
diff --git a/tools/.gitignore b/tools/.gitignore
new file mode 100644
index 0000000000..f9e8ce6f8a
--- /dev/null
+++ b/tools/.gitignore
@@ -0,0 +1,3 @@
+single-page.html
+node_modules
+out
\ No newline at end of file
diff --git a/tools/README.md b/tools/README.md
new file mode 100644
index 0000000000..c88f7144c3
--- /dev/null
+++ b/tools/README.md
@@ -0,0 +1,17 @@
+Tools used to publish the HTML WG drafts
+========================================
+
+This repository contains tools to generate the multipage HTML specification from a single page.
+
+Installation
+============
+
+Check out the repository, then run `npm install`.
+
+Running
+=======
+
+You can run this as:
+
+ node multipage.js [path of the spec] [output directory]
+
diff --git a/tools/linkdiff/README.md b/tools/linkdiff/README.md
new file mode 100644
index 0000000000..99239d1406
--- /dev/null
+++ b/tools/linkdiff/README.md
@@ -0,0 +1,197 @@
+linkdiff - Link Correctness Comparison Tool
+===========================================
+
+Abstract
+============
+
+One of the common, difficult to figure-out problems in the current HTML spec is whether links are
+"correct". Not "correct" as in syntax or as opposite to broken links, but rather that the link in
+question goes to the semantically correct place in the spec or other linked spec. Correctness, in
+this sense, can only be determined by comparing the links to a canonical "correct" source. In the
+case of the W3C HTML spec, the source used for determining correctness is the WHATWG version of the
+spec.
+
+Usage
+======
+
+```
+python ./linkdiff https://html.spec.whatwg.org/ https://w3c.github.io/html/single-page.html > link_report.json
+```
+
+Approach
+=======
+The approach taken by this project is to validate two things: first that a given **origin link**
+(the `<a>` itself) can be compared properly between the W3C and the WHATWG specs. Since one or
+both specs may have links that the other doesn't, all links to be tested must first be checked to
+see that they are essentially "the same" link. If they are not the same, then no checking for
+correctness is necessary (and they can be flagged for follow-up). Once two origin links are
+determined to be the same, then their respective **link targets** (the place where clicking the link
+would go) can be checked. Checking link targets will use the same technique as validating the two
+origin links are the same.
+
+Details for Same-ness check
+=========================
+
+Due to possible small structure and prose differences in the W3C and WHATWG specs, to validate that
+either the source or target of a link is the same relative place, a statistical approach is used.
+Structure of the surrounding document is ignored, and some amount of textual content is extracted.
+This text is then lexically compared to generate a ratio of alignment (used for diffing). Where
+there is a high percentage of alignment between the tokens (given a statistically-sound
+sample size), the links can be considered the same. The target of the links is then located in the
+document and compared in like manner.
+
+Links matching
+================
+
+The tool assumes that one link in the **baseline doc** should have a corresponding link (and only
+one) in the **source doc**. The link matching algorithm will attempt to match up every link in the
+baseline doc to exactly one match in the source doc and vice-versa. To avoid potential `O(n^2)`
+runtime for matching up links (especially where n is very large), the matching algorithm uses
+an index and selects the best candidate match from among the entire set of possible links. Multiple
+matches above the given threshold are possible, and all such matches are saved in the matching phase.
+Following the matching phase, duplicate matches are resolved taking all potential candidates (from
+the baseline and source documents) into account and selecting the match with the highest ratio. In case
+of a tie, the first match in document order is chosen.
+
+Input
+=======
+
+In order to avoid hard-coding the percent-alignment necessary to constitute "sameness" the program may
+take as input a floating-point number between 0 and 1, representing the percentage threshold to use
+when determining if two links are the same. The default value is **0.8** (80% similar).
+
+An "ignore list" file may be provided for links, if encountered, to be skipped during processing.
+The file contents should be JSON formatted using the following syntax. The default value is to **not
+use an ignore list**.
+
+```json
+{
+ "ignoreList": [
+ "full_URL_here_including_hash_or_query_param",
+ "additional_URLs_here"
+ ]
+}
+```
+
+A "visual diff" flag may be provided, which causes the tool to additionally output two files: a
+`visual_baseline.html` and `visual_source.html`. These files will contain a visual output of the
+link diff tool: gray highlighted links are not found, red-ish links are matched but not correct,
+while green links are matched and correct.
+
+Output
+========
+
+The tool returns a report of the links checked in two given documents (the baseline document
+compared to the test document), as well as the links skipped. The report is in JSON and has the
+following format:
+
+```json
+{
+ "ratioThreshold": #.#,
+ "matchingLinksTotal": ##,
+ "correctLinksTotal": ##,
+ "potentialMatchingLinksSetSize": ##,
+ "percentMatched": #.#,
+ "percentCorrect": #.#,
+ "baselineDoc": {
+ "linksTotal": ###,
+ "nonMatchedTotal": ##,
+ "linkIndex": [
+ {
+ "index": ##,
+ "status": "",
+ "href": "",
+ "matchIndex": ##,
+ "matchRatio": #.#,
+ "correctRatio": #.#,
+ "lineNo": ##
+ }
+ ]
+ },
+ "sourceDoc": {
+ ...
+ }
+}
+```
+
+* `ratioThreshold` - reflects the value used to determine the threshold above which the links are
+ considered matching or correct.
+* `matchingLinksTotal` - the number of origin links which were both found to be the same link in
+ both the baseline doc and the source doc.
+* `correctLinksTotal` - the number of origin links which were both found to be the same link in
+ both the baseline doc and the source doc **and** whose link targets were also found to be the
+ same.
+* `potentialMatchingLinksSetSize` - the minimum of the total link count from the baseline doc and
+ the source doc -- the upper bound on the number of potential possible matches assuming every
+ link could be matched between the baseline and source docs.
+* `percentMatched` - the percentage as a value between 0 and 1 of `matchingLinksTotal` divided by
+ the `potentialMatchingLinksSetSize`.
+* `percentCorrect` - the percentage as a value between 0 and 1 of `correctLinksTotal` divided by
+ the `potentialMatchingLinksSetSize`.
+* `baselineDoc` and `sourceDoc` have the same structure:
+ * `linksTotal` - the total number of links found in the respective document
+ * `nonMatchedTotal` - the `linksTotal` less `matchingLinksTotal` number. How many of the total
+ links were not matched at all.
+ * `linkIndex` - an array of link objects--one entry for each link found in the document
+
+A link object has:
+
+* `index` - the ordinal index of this link in document order.
+* `status` - a string, one of:
+ * "non-matched" - the link wasn't matched up with any other link in the other document. For
+ the statistics, this increments the `nonMatchedTotal` value.
+ * "matched" - the link was only matched in the other document (but the link target was not
+ matched, e.g., not "correct"). For the statistics, this increments the `matchingLinksTotal`
+ value.
+ * "correct" - the link was matched and the respective links in both baseline and source docs
+ refer to the same relative place when followed. For the statistics, this increments the
+ `correctLinksTotal` value.
+ * "skipped" - the link was skipped because it was one of the links present on the ignore list
+ provided as input. For the statistics, this decrements the pool of potential links for this
+ document, which contributes to the `potentialMatchingLinksSetSize`.
+    * "broken" - the link was non-external, yet didn't resolve to anywhere in the document. For
+ statistics, this increments no values (it's in the `potentialMatchingLinksSetSize`, but not
+ a part of the `correctLinksTotal`--same bucket as the `matchingLinksTotal`).
+ * "non-matched-external" - the link wasn't matched up with any other link in the other
+ document, and the link's href refers to an external location (its value is not checked).
+ For the statistics, this increments the `nonMatchedTotal` value.
+ * "matched-external" - the link was matched up in the other document, but the URLs are not
+ 100% the same. For the statistics, this increments the `matchingLinksTotal` value.
+    * "correct-external" - the link was matched up in the other document and its URL is 100%
+ the same--both specs external link targets would end-up in exactly the same place. For the
+ statistics, this increments the `correctLinksTotal` value.
+* `href` - the original value of the link's href attribute (to help with locating this specific
+  link in the source document).
+* `matchIndex` - if the status value is "matched", "correct", "matched-external" or
+ "correct-external" (considered **matching statuses**), this is the index to the matching link
+ in the other document (if the link object is in the `baselineDoc`'s list of `linkIndexes`,
+ then this link object's `matchIndex` is a reference to a link object with matching index in
+ the `sourceDoc`'s list of `linkIndexes`). Otherwise the `matchIndex` will be the "closest"
+ (best) match that could be located among all potential candidates. If all candidates were
+ equally comparable (e.g., had the same `matchRatio`) then this is the last candidate checked.
+ The default value is -1.
+* `matchRatio` - the value from 0 to 1 used to determine that this was a match (the first match
+ found for any matching statuses), or if the status is not one of the matching statuses, the
+ best ratio obtained from the best possible match of all the links tested. The default value is
+ 0.
+* `correctRatio` - the value from 0 to 1 used to determine that the target of the link was correct
+ (or not) for the given matched pair. The default value is 0.
+* `lineNo` - the line number in the source where the link was encountered. If the "visual diff" flag
+ is provided, the line number will refer to the line numbers in the output `visual_baseline.html`
+ and `visual_source.html` files (which have a different offset due to injected CSS and JavaScript).
+
+External Links
+==============
+
+The algorithm only follows and validates "internal" links (to make sure they land in the appropriate
+section of the spec). For external links, the algorithm attempts to match up the origin links, and
+only considers the actual HREF value as a string literal to see if the HREF would go to the same
+place if clicked. If there are any differences (e.g., the string is not 100% the same), then the
+external link is not considered "correct".
+
+Dependencies
+==============
+
+Uses the built-in Python library [difflib](https://docs.python.org/2/library/difflib.html) for an
+implementation of token matching and ratio of sameness calculations. It also uses python's built-in
+[HTMLParser](https://docs.python.org/2/library/htmlparser.html) library for parsing help.
diff --git a/tools/linkdiff/linkdiff.py b/tools/linkdiff/linkdiff.py
new file mode 100644
index 0000000000..ae003058f2
--- /dev/null
+++ b/tools/linkdiff/linkdiff.py
@@ -0,0 +1,1529 @@
+# linkdiff.py
+# By Travis Leithead
+# 2016/10/05
+
+from HTMLParser import HTMLParser
+import sys
+import platform
+import os.path
+import codecs
+import json
+import urllib
+#import time
+import math
+from multiprocessing import Process, Pipe, Pool, Manager
+import re
+import multiprocessing
+
+# Subclass the parser to build the DOM described below. Since the
+# DOM will only be used for tracking links and what they link to, the
+# only retained nodes are potential link targets (Element objects)
+# and links (LinkElement), as well as all text nodes (TextNode).
+# Tree-structure is not important, as I only need to worry about what
+# text is "before" and "after" a given target. So the parser (as a depth-
+# first traversal of markup tags) will let me build a linear representation
+# of the start tags that matter and put the text in the right logical
+# order for comparison.
+class LinkAndTextHTMLParser(HTMLParser):
+ """Parses links and text from HTML"""
+ def handle_starttag(self, tag, attrs):
+ attrNames = [attr[0] for attr in attrs]
+ if tag == "a" and "href" in attrNames:
+ attrValues = [attr[1] for attr in attrs]
+ # an anchor may also have an id and be a link target as well.
+ hasId = ""
+ if "id" in attrNames:
+ hasId = attrValues[attrNames.index("id")]
+ link = LinkElement(self.linkCountIndex, attrValues[attrNames.index("href")], HTMLParser.getpos(self)[0], hasId )
+ self.linkCountIndex += 1
+ self._append_to_head(link)
+ self.doc.links.append(link)
+ if hasId != "":
+ self._append_to_map(hasId, link)
+ elif "id" in attrNames:
+ attrValues = [attr[1] for attr in attrs]
+ elemId = attrValues[attrNames.index("id")]
+ elem = Element(elemId)
+ self._append_to_head(elem)
+ self._append_to_map(elemId, elem)
+ else:
+ self.doc.droppedTags += 1
+
+ def handle_startendtag(self, tag, attrs):
+ self.handle_starttag(tag, attrs)
+
+ def handle_data(self, data):
+ text = TextNode(data)
+ self._append_to_head(text)
+
+ def handle_entityref(self, name):
+ self.handle_data("&"+name+";") #pass these through un-modified
+
+ def handle_charref(self, name):
+ self.handle_data(""+name+";")
+
+ def _append_to_head(self, node):
+ if self.head == None:
+ self.head = node
+ self.doc.start = node
+ else: #Hook up the bidirectional links
+ self.head.next = node
+ node.prev = self.head
+ self.head = node
+
+ def _append_to_map(self, key, node):
+ if key not in self.doc._idMap:
+ self.doc._idMap[key] = node
+
+ def parse(self, markup):
+ self.doc = Document()
+ self.linkCountIndex = 0
+ self.head = None
+ self.droppedTagCount = 0
+ HTMLParser.reset(self) # among other things, resets the line numbering :-)
+ HTMLParser.feed(self, markup)
+ HTMLParser.close(self)
+ self.head = None
+ doc = self.doc
+ self.doc = None
+ return doc
+
+# Document produced by the Parser has the following IDL
+
+# interface Document {
+# readonly attribute LinkElement[] links;
+# readonly attribute Node start;
+# TreeNode getElementById(str id);
+# readonly attribute unsigned long droppedTags;
+# };
+
+# interface Node {
+# readonly attribute Node? prev;
+# readonly attribute Node? next;
+# };
+
+# interface TextNode : Node {
+# readonly attribute str textContent;
+# };
+
+# only nodes with an ID are retained by the parser.
+# interface Element : Node {
+# readonly attribute str id; #reflects the id content attribute
+# };
+
+# interface LinkElement : Element {
+# readonly attribute unsigned long index;
+# attribute LinkTreeNodeStatus status;
+# readonly attribute str href;
+# attribute long matchIndex;
+# attribute double matchRatio;
+# attribute double correctRatio;
+# readonly attribute unsigned long lineNo;
+# };
+
+# enum LinkTreeNodeStatus = {
+# "non-matched",
+# "matched",
+# "correct",
+# "skipped",
+# "broken",
+# "non-matched-external",
+# "matched-external",
+# "correct-external"
+# };
+
+class Document:
+ def __init__(self):
+ self.links = []
+ self.start = None
+ self._idMap = {}
+ self.droppedTags = 0
+ #self.index #added during indexing! hash of "word" <-> [0:count, 1-n:link index]
+ #self.unIndexed #added during indexing! list of "words" too common to be useful in indexing.
+
+ def getElementById(self, id):
+ if id in self._idMap:
+ return self._idMap[id]
+ else:
+ return None
+
+class Node():
+ def __init__(self):
+ self.prev = None
+ self.next = None
+
+class TextNode(Node):
+ def __init__(self, initialText):
+ Node.__init__(self)
+ self.textContent = initialText
+ def __str__(self):
+ return "text<"+self.textContent[:40]+ ( "..." if len(self.textContent) > 40 else "" ) + "> (len:"+str(len(self.textContent))+")"
+
+class Element(Node):
+ def __init__(self, elemId):
+ Node.__init__(self)
+ self.id = elemId
+ self._cachedContextualText = None
+ def __str__(self):
+ return '{ "id":"' + self.id.encode('ascii', 'xmlcharrefreplace') + '" }' #because attrs have their entites handled by the parser, and ascii output may not handle them.
+
+class LinkElement(Element):
+ def __init__(self, index, href, lineNo, elemId):
+ Element.__init__(self, elemId)
+ self.index = index
+ self.href = href
+ self.lineNo = lineNo
+ #self.words #added during indexing!
+ self.status = "non-matched"
+ self.matchIndex = -1
+ self.matchRatio = 0.0
+ self.correctRatio = 0.0
+ def __str__(self): # called for str(link)
+ return '{"index":' + str(self.index) + ',"matchIndex":' + str(self.matchIndex) + ',"matchRatio":' + str(self.matchRatio)[:5] + ',"correctRatio":' + str(self.correctRatio)[:5] + ',"lineNo":' + str(self.lineNo) + ',"status":"' + self.status + '","href":"' + self.href.encode('ascii', 'xmlcharrefreplace') + '"' + (',"id":"' + self.id + '"' if self.id != '' else '') + '}'
+ def __getstate__(self): # called by pickle protocol (see when mem.baseAllLinks is set)
+ return {'index': self.index, 'matchIndex': self.matchIndex, 'matchRatio': self.matchRatio, 'correctRatio': self.correctRatio, 'lineNo': self.lineNo, 'status': self.status, 'href': self.href, 'id': self.id}
+
+def parseTextToDocument(htmlText, statusText = None):
+ parser = LinkAndTextHTMLParser()
+ if statusText != None:
+ statusUpdate(statusText)
+ return parser.parse(htmlText)
+
+# index is a hashtable of "name" <-> [n:matching link index, n+1:number of occurrences of "name" at the matching index, ...]
+def buildIndex(doc, statusText = None):
+ if statusText != None:
+ statusUpdate(statusText)
+ doc.index = {}
+ doc.unIndexed = [] # because they're too common to be useful...
+ tooCommonThreshold = len(doc.links)
+ if len(doc.links) > 100:
+ tooCommonThreshold = tooCommonThreshold / 3 #if more than 1/3 of all links have this word, then it's too common!
+ # slice the text in the document up into words and attach (HALF_WORD_COUNT * 2) number of words to each link
+ for linkIndex in xrange(len(doc.links)):
+ link = doc.links[linkIndex]
+ wordsList = getDirectionalContextualWords(link, True) + getDirectionalContextualWords(link, False)
+ # Group duplicate word entries in the wordsList so that each word has an occurence count
+ uniqueWords = {}
+ for word in wordsList:
+ if word in uniqueWords:
+ uniqueWords[word] += 1
+ else:
+ uniqueWords[word] = 1
+ link.words = []
+ for uniqueWord in uniqueWords:
+ # Assemble local saved words into a structure similar to the index: ['word', occurence count, ...]
+ link.words.append(uniqueWord)
+ link.words.append(uniqueWords[uniqueWord])
+ # Build the index
+ if uniqueWord in doc.unIndexed:
+ continue # too common to be included.
+ if uniqueWord in doc.index:
+ doc.index[uniqueWord].append(linkIndex)
+ doc.index[uniqueWord].append(uniqueWords[uniqueWord])
+ if len(doc.index[uniqueWord]) / 2 > tooCommonThreshold:
+ doc.unIndexed.append(uniqueWord)
+ del doc.index[uniqueWord] # remove it from the index
+ else:
+ doc.index[uniqueWord] = [linkIndex, uniqueWords[uniqueWord]]
+ doc.statsWordsTooCommonCount = len(doc.unIndexed)
+ doc.statsUniqueWordCount = len(doc.index)
+ ave = 0
+ for key in doc.index:
+ ave += (len(doc.index[key]) / 2)
+ if len(doc.index) == 0:
+ doc.statsAverageCountPerWord = 0.0
+ else:
+ doc.statsAverageCountPerWord = ave / float(len(doc.index))
+
+# Process entry point
+# For a given list of words, find the matching (set of) index(es) in the provided index.
+# Returns an array of candidates that meet the MATCH_RATIO_THRESHOLD bar (>=). Consisting of
+# tuples (ratio, associatedIndex, associatedIndex) in preferential order from most preferred (index 0) to
+# least preferred.
+def StartBuildMatchResult(tuple):
+ wordList, otherIndex, otherNonIndexed, otherLinksLen, wordListOriginIndex, renderProgress, mem = tuple
+ setGlobals(mem)
+ possibleMatches = 0
+ for i in xrange(1, len(wordList), 2): #sum the [initial] total number of possible matches (the count of all non-unique words in the list)
+ possibleMatches += wordList[i]
+ allLinks = [0] * otherLinksLen # creates an array initialized with zeros
+ for i in xrange(0, len(wordList), 2):
+ word = wordList[i]
+ if word in otherNonIndexed: # skip and reduce the ratio threshold for any too-common words
+ possibleMatches -= wordList[i+1] # change can be merged into this loop because each word is unique
+ continue
+ if word in otherIndex:
+ linkIndexes = otherIndex[word] # around 250 on average (could be much smaller or a lot bigger)
+ for n in xrange(0, len(linkIndexes), 2):
+ allLinks[ linkIndexes[n] ] += min(wordList[i+1], linkIndexes[n+1]) # when dups are involved, only select from what is available at each link
+ assert allLinks[linkIndexes[n]] <= possibleMatches, "There cannot be a value greater than possible matches (word: " + word + ", read: " + str(allLinks[linkIndexes[n]]) + ", max: " + str(possibleMatches) + ") wordlist: " + str(wordList)
+ if possibleMatches == 0:
+ return [(0.0, -1, -1)]
+ matchValueThreshold = int(math.ceil(possibleMatches * MATCH_RATIO_THRESHOLD))
+ possibleMatches = float(possibleMatches) # convert to float so that later division is floating point
+ highestMatchValueFound = 0
+ bestMatchingIndex = -1
+ candidacyAchieved = False
+ candidates = [] # Only those that meet the bar
+ for i in xrange(otherLinksLen):
+ numMatchesOfI = allLinks[i]
+ if not candidacyAchieved and numMatchesOfI > highestMatchValueFound:
+ highestMatchValueFound = numMatchesOfI
+ bestMatchingIndex = i
+ if numMatchesOfI >= matchValueThreshold:
+ candidates.append((numMatchesOfI/possibleMatches, i, wordListOriginIndex))
+ candidacyAchieved = True
+ elif candidacyAchieved and numMatchesOfI >= matchValueThreshold:
+ candidates.append((numMatchesOfI/possibleMatches, i, wordListOriginIndex))
+ if not candidacyAchieved:
+ candidates.append((highestMatchValueFound/possibleMatches, bestMatchingIndex, -1))
+ # candidates.sort(key=itemgetter(0),reverse=True) # sorts based on 0th item in each tuple (biggest value first)
+ if renderProgress:
+ progress = mem.progress
+ progress += 1
+        mem.progress = progress # potentially racy... might lose progress if multiple processes read/write the value while overlapping
+ statusUpdateInline("matching... " + str(progress) + "%")
+ # Return the list of tuples (ratio, bestMatchingIndex)
+ return candidates
+
+# Performs the following: 1) in-place modifies the provided matchResultsArray to contain the result
+# set for the "own" links collection, (resolved hits and misses combined and in the canonical order
+# AND 2) returns a sparse list for "near-matches" (the links potentially matching--with qualifying
+# ratios--but were not selected as the "official" match per this algorithm). The single tuple result
+# will be the tuple with the highest ratio if there were multiples.
+# That only leaves the set of links which were not matched at all in the "other" set un-analyzed
+# (matched and "near-matched" are handled here) which will have a 0.0 ratio--which is probably not
+# true. To get the best-match ratio for these unmatched links, the StartBuildMatchResult algorithm
+# must be run for each of them (with no expected "new" matches--just refined un-matched best-case
+# ratios).
+def resolveMatchResultConflicts(matchResultsArray):
+ # These two maps are used for eliminating match combinations w/out affecting the original array
+ rowResults = {}
+ colResults = {}
+ matchResultsArrayLen = len(matchResultsArray)
+ over50Count = 0 # Match resolving can be expensive. If a row has over 50 matches, that's a sure sign of potential slowness for the whole algorithm.
+ statusUpdate('\nResolving match conflicts...(this may take a few minutes)')
+ for i in xrange(matchResultsArrayLen):
+ if matchResultsArray[i][0][2] == -1:
+ matchResultsArray[i] = matchResultsArray[i][0]
+ else:
+ rowResults[i] = matchResultsArray[i]
+ if len(matchResultsArray[i]) >= 50:
+ over50Count += 1
+ for matchTuple in matchResultsArray[i]:
+ if matchTuple[1] not in colResults:
+ colResults[matchTuple[1]] = []
+ colResults[matchTuple[1]].append(matchTuple)
+ if matchResultsArrayLen > 1000 and over50Count > (matchResultsArrayLen / 10): # show this at >10% of all links
+ statusUpdate('**Note** ' + str(int(float(over50Count) / matchResultsArrayLen * 100)) + '% of all links have more than 50 match conflicts each.')
+ statusUpdate(' Consider increasing the match ratio to reduce match conflicts (via the -ratio command line flag).')
+ onePercent = matchResultsArrayLen / 100 if matchResultsArrayLen > 1000 else matchResultsArrayLen + 1
+ i = 0
+ count = 0
+ percent = 0
+ while i < matchResultsArrayLen:
+ if not i in rowResults: # Resolved in a previous iteration--move along without trying to resolve
+ i += 1
+ continue
+ # resolveMatchRow may not resolve the row it's on, but it is guaranteed to resolve one row somewhere.
+ if resolveMatchRow(i, rowResults, colResults, matchResultsArray):
+ i += 1 # resolved the row it was on. Move to next row.
+ count += 1 # spent time resolving a row.
+ if count % onePercent + 1 == onePercent:
+ percent += 1
+ statusUpdateInline("resolving... " + str(percent) + "%")
+ otherNearMatches = [] # fill-in for "near-matches" where no match was found in a column despite there being options for a potential match.
+ for colIndex in colResults.keys():
+        # find local maximum ratio among remaining options
+ biggestRatio = -0.1
+ biggestRowIndex = -1
+ for tuple in colResults[colIndex]:
+ if tuple[0] > biggestRatio:
+ biggestRatio = tuple[0]
+ biggestRowIndex = tuple[2]
+ otherNearMatches.append((biggestRatio, colIndex, biggestRowIndex))
+ return otherNearMatches
+
+# Returns true if the designated row was resolved; false if some other row was resolved.
+# in-place modifies both rowDict and colDict when a match occurs, both the related row/col dictionary
+# entry are removed; for rowDict this helps with later skipping an already-resolved row when iterating
+# the rowIndexes; for colDict this excludes columns from being considered for "near matches" after
+# all rows have been resolved.
+def resolveMatchRow(rowIndex, rowDict, colDict, finalMatchArray):
+ rowLen = len(rowDict[rowIndex])
+ assert rowLen != 0, 'If there is a row, it must have more than zero elements...'
+ colConstrained = False
+ rowConstrained = False
+ if rowLen == 1:
+ colIndex = rowDict[rowIndex][0][1]
+ colLen = len(colDict[colIndex])
+ assert len(colDict[colIndex]) > 0, "I don't think this array should ever be empty, if I do maintenance right"
+ for tuple in colDict[colIndex]:
+ if len(rowDict[tuple[2]]) > 1:
+ break
+ else:
+ colConstrained = True
+ for tuple in rowDict[rowIndex]:
+ assert len(colDict[tuple[1]]) > 0, "I don't think this array should ever be empty, if I do maintenance right"
+ if len(colDict[tuple[1]]) > 1:
+ break
+ else:
+ rowConstrained = True
+ if not rowConstrained and not colConstrained:
+ return resolveNonConstrainedMatches(rowIndex, rowDict, colDict, finalMatchArray)
+ elif rowConstrained and colConstrained:
+ finalMatchArray[rowIndex] = rowDict[rowIndex][0]
+ # Remove the colDict entry so that it is not checked later when gathering otherNearMatches
+ del colDict[rowDict[rowIndex][0][1]]
+ del rowDict[rowIndex]
+ return True
+ elif rowConstrained:
+ biggestRatio = rowDict[rowIndex][0][0]
+ biggestIndex = 0
+ for i in xrange(1, len(rowDict[rowIndex])):
+ if rowDict[rowIndex][i][0] > biggestRatio:
+ biggestRatio = rowDict[rowIndex][i][0]
+ biggestIndex = i
+ finalMatchArray[rowIndex] = rowDict[rowIndex][biggestIndex]
+ del colDict[ rowDict[rowIndex][biggestIndex][1] ]
+ del rowDict[rowIndex]
+ return True
+ else:
+ colIndex = rowDict[rowIndex][0][1]
+ biggestRatio = colDict[colIndex][0][0]
+ biggestIndex = 0
+ for i in xrange(1, len(colDict[colIndex])):
+ if colDict[colIndex][i][0] > biggestRatio:
+ biggestRatio = colDict[colIndex][i][0]
+ biggestIndex = i
+ for i in xrange(len(colDict[colIndex])):
+ rovingColumnTuple = colDict[colIndex][i]
+ rovingRowIndex = rovingColumnTuple[2]
+ finalMatchArray[rovingRowIndex] = (rovingColumnTuple[0], rovingColumnTuple[1], rovingColumnTuple[2] if i == biggestIndex else -1)
+ del rowDict[rovingRowIndex]
+ del colDict[colIndex]
+ return rowIndex == biggestIndex
+
+def resolveNonConstrainedMatches(anchorRowIndex, rowDict, colDict, finalMatchArray):
+ bestMatches = {}
+ bestMatches["highestRatio"] = -0.1
+ bestMatches["highestRowDict"] = bestMatches["highestColDict"] = None
+ # build constraining range + visit/test first row
+ constrainingColRange = {}
+ anchorColumns = []
+ for tuple in rowDict[anchorRowIndex]:
+ constrainingColRange[tuple[1]] = True
+ anchorColumns.append(tuple[1])
+ updateBestMatches(tuple, bestMatches)
+ visitedRow = {}
+ for colIndex in anchorColumns: # perf note: the 'in' expression is only evaluated once
+ # traverse each column
+ for colIterator in xrange(1, len(colDict[colIndex])): # skips the anchor row (handled earlier)
+ colTuple = colDict[colIndex][colIterator]
+ # Get the row of this column entry and iterate (if the row hasn't been visited)
+ if colTuple[2] in visitedRow:
+ continue # Skip this row
+ visitedRow[colTuple[2]] = True
+ # pre-scan for >= best ratio results that are out the constraining range. This is a pre-
+ # scan because I would otherwise need to roll-back the state of the highestCol/RowDict
+ # objects if they found a (legitimate) higher value before stumbling on the out-of-range
+ # option.
+ for preScanRowTuple in rowDict[colTuple[2]]:
+ if preScanRowTuple[0] >= bestMatches["highestRatio"] and not preScanRowTuple[1] in constrainingColRange:
+ break # invalidating this entire row
+ else: # loop-completed w/out breaking, row is safe.
+ for rowTuple in rowDict[colTuple[2]]:
+ updateBestMatches(rowTuple, bestMatches)
+ # This has a stable ascending sort for ordinal keys, so regardless of the order they were added,
+ # they will be processed in the correct order.
+ highestRowDict = bestMatches["highestRowDict"]
+ highestColDict = bestMatches["highestColDict"]
+ if len(highestRowDict) == 1 and len(highestColDict) == 1:
+ rowIndex = highestRowDict.keys()[0]
+ selectAndRemoveFromNonConstrainedMatches(rowIndex, highestColDict.keys()[0], rowDict, colDict, finalMatchArray)
+ return anchorRowIndex == rowIndex
+ # check each entry, row-by-row for only entry in its row or col. If so, select it and quit.
+ rowKeys = highestRowDict.keys()
+ rowKeys.sort()
+ for rowKey in rowKeys:
+ for tuple in highestRowDict[rowKey]:
+ if len(highestColDict[tuple[1]]) == 1 or len(highestRowDict[tuple[2]]) == 1:
+ selectAndRemoveFromNonConstrainedMatches(tuple[2], tuple[1], rowDict, colDict, finalMatchArray)
+ return anchorRowIndex == tuple[2]
+ # Stalemate. Pick row 0, first item.
+ tuple = highestRowDict[rowKeys[0]][0]
+ selectAndRemoveFromNonConstrainedMatches(tuple[2], tuple[1], rowDict, colDict, finalMatchArray)
+ return anchorRowIndex == tuple[2]
+
# Fold one candidate match tuple (ratio, colIndex, rowIndex) into the
# running best-match state in 'best'. 'best' tracks the highest ratio seen
# so far plus dictionaries grouping every tied tuple by its row and by its
# column key.
def updateBestMatches(tuple, best):
    ratio, colKey, rowKey = tuple
    if ratio > best["highestRatio"]:
        # Strictly better ratio: discard all previously recorded ties and
        # restart both groupings with just this tuple.
        best["highestRatio"] = ratio
        best["highestRowDict"] = {rowKey: [tuple]}
        best["highestColDict"] = {colKey: [tuple]}
    elif ratio == best["highestRatio"]:
        # A tie with the current best: record it under both its row key
        # and its column key (creating the bucket on first use).
        best["highestRowDict"].setdefault(rowKey, []).append(tuple)
        best["highestColDict"].setdefault(colKey, []).append(tuple)
+
# Commit the match (rowIndex, colIndex) into finalMatchArray and scrub every
# other reference to this row/column out of rowDict and colDict so later
# iterations cannot re-match them. Tuples appear to be (ratio, colIndex,
# rowIndex) -- consistent with how updateBestMatches keys them; confirm
# against the builder of rowDict/colDict.
# Mutates: rowDict, colDict (entries deleted), finalMatchArray (entry set).
def selectAndRemoveFromNonConstrainedMatches(rowIndex, colIndex, rowDict, colDict, finalMatchArray):
    selectedRow = rowDict[rowIndex]
    selectedCol = colDict[colIndex]
    # remove any column results from this matched row...
    for tuple in selectedRow:
        if tuple[1] == colIndex:
            # This is the winning tuple: record it as the row's final match.
            finalMatchArray[rowIndex] = tuple
        else:
            # The same tuple is also listed under its column; remove that
            # copy so the column no longer offers this (now-taken) row.
            col = colDict[tuple[1]]
            for i in xrange(len(col)):
                if tuple == col[i]:
                    del col[i]
                    break
    # for each column, may need to remove isolated non-matching row entries, so they are not visited
    # later (they can't be matched).
    for tuple in selectedCol:
        if tuple[2] != rowIndex:
            row = rowDict[tuple[2]]
            for i in xrange(len(row)):
                if tuple == row[i]:
                    if len(row) == 1: #don't leave a row vacant as a result
                        # Last candidate for that row just became unusable:
                        # record it as unmatched (-1) and drop the row.
                        finalMatchArray[row[i][2]] = (row[i][0], row[i][1], -1)
                        del rowDict[row[i][2]]
                    else:
                        del row[i]
                    break
    del colDict[colIndex] # prevents searching this column for "near matches" later
    del rowDict[rowIndex]
+
# Copy match results onto this document's own link objects.
#   ownMatchResultsArray -- one (ratio, otherIndex, ownIndex) tuple per link,
#       in the same order as ownLinks; (0.0, -1, -1) means "no match possible".
#   ownLinks -- the document's link objects, mutated in place.
# Returns the number of links marked 'matched'.
# (Improvement: pairs results with links via zip instead of an index loop --
# same positional pairing, no manual subscripting.)
def applyOwnMatchArray(ownMatchResultsArray, ownLinks):
    assert len(ownMatchResultsArray) == len(ownLinks), 'Baseline and matched lists must have the same length'
    matchesCount = 0
    for (ratio, otherIndex, ownIndex), link in zip(ownMatchResultsArray, ownLinks):
        link.matchRatio = ratio
        # ownIndex == -1 flags a row with no candidate match at all.
        link.matchIndex = -1 if ownIndex == -1 else otherIndex
        if ownIndex != -1:
            link.status = 'matched'
            matchesCount += 1
    return matchesCount
+
# Apply match results computed against the other document to our own links.
# Each entry is (ratio, ownIndex, otherIndex); rows of (0.0, -1, -1) carry
# no information for this side and are skipped. Near-miss entries update
# only the stored ratio -- those links remain unmatched.
# Returns the number of links newly marked 'matched'.
def applyOtherMatchArray(otherMatchResultsArray, nearMissesList, ownLinks):
    matchesCount = 0
    for ratio, ownIndex, otherIndex in otherMatchResultsArray:
        if ownIndex == -1:
            continue  # no meaningful info in (0.0, -1, -1) rows
        link = ownLinks[ownIndex]
        link.matchRatio = ratio
        link.matchIndex = otherIndex
        if otherIndex != -1:
            link.status = 'matched'
            matchesCount += 1
    # Record ratio information from the near-misses list; status unchanged.
    for ratio, ownIndex, missedValue in nearMissesList:
        ownLinks[ownIndex].matchRatio = ratio
    return matchesCount
+
# Pre-pass over a document's links after matching. Unmatched links get a
# terminal status; matched links are classified as skipped, external,
# broken, or resolvable to a target element. When the parameter
# generateOtherLists is true, returns a tuple the peer process needs:
#   0 - skipped total
#   1 - [ (otherIndex, hrefValue) ]
#   2 - [ (otherIndex, [words]) ]
# where "otherIndex" are indexes in the other document's link index
# (taken from matchIndex).
def preCheck4Correct(doc, generateOtherLists = False):
    externalsForOther = []
    wordsForOther = []
    skippedCount = 0
    for link in doc.links:
        if link.status != "matched":
            # Unmatched external links get their own terminal status.
            if check4External(link):
                link.status = "non-matched-external"
            continue
        if link.href in IGNORE_LIST:
            link.status = "skipped"
            skippedCount = skippedCount + 1
            continue
        if check4External(link):
            link.status = "matched-external"
            # Whether this is actually correct is only known after
            # cross-checking with the other process.
            if generateOtherLists:
                externalsForOther.append((link.matchIndex, link.href))
            continue
        targetElem = doc.getElementById(getLinkTarget(link.href))
        if targetElem == None:
            link.status = "broken"
            continue
        targetWords = getDirectionalContextualWords(targetElem, True) + getDirectionalContextualWords(targetElem, False)
        if generateOtherLists:
            wordsForOther.append((link.matchIndex, targetWords))
        else:
            # In-place update: the link's word list becomes the target's.
            link.words = targetWords
    return (skippedCount, externalsForOther, wordsForOther)
+
# returns an array of results for each provided array as:
# 0 - total own correct
# 1 - [indexes of potentially correct external links]
# 2 - [(indexOfCorrectLink,correctRatio)]
# where the indexes are per the other document.
# 'otherExternal'/'otherWords' are the lists produced by the peer process's
# preCheck4Correct(..., generateOtherLists=True); indexes in them address
# doc.links directly.
def check4Correct(doc, otherExternal, otherWords):
    correctExternals = []
    correctWords = []
    for externTuple in otherExternal:
        index, href = externTuple
        link = doc.links[index]
        # External link is correct when both documents agree on the href.
        if link.href == href:
            correctExternals.append(link.matchIndex)
            link.status = 'correct-external'
            link.correctRatio = 1.0
    totalOwnCorrect = len(correctExternals)
    for wordTuple in otherWords:
        index, words = wordTuple
        link = doc.links[index]
        ownWords = link.words
        # Ratio from the other document's perspective: its words vs ours.
        wordsToOwnWordsRatio = getRatio(words, ownWords)
        # If the lengths of the two lists are the same, then the same ratio of matches is interchangeable
        # e.g., for two lists with 4 items, if only 2 items match from one list to the other, more than
        # 2 items cannot match if the lists were interchanged. This is not true when the lists have different
        # lengths, as the matching ratios can be different...
        if len(words) == len(ownWords):
            link.correctRatio = wordsToOwnWordsRatio
            if wordsToOwnWordsRatio >= MATCH_RATIO_THRESHOLD:
                link.status = 'correct'
                totalOwnCorrect += 1
                correctWords.append((link.matchIndex, wordsToOwnWordsRatio))
            continue
        # Different lengths: our own correctness uses the reverse ratio
        # (our words vs theirs)...
        link.correctRatio = getRatio(ownWords, words)
        if link.correctRatio >= MATCH_RATIO_THRESHOLD:
            link.status = 'correct'
            totalOwnCorrect += 1
            # ...but the value reported back in correctWords stays the
            # other-document-perspective ratio -- NOTE(review): looks
            # intentional (it is applied to the OTHER doc's link by
            # applyCorrectnessResults), but worth confirming.
            correctWords.append((link.matchIndex, wordsToOwnWordsRatio))
    return (totalOwnCorrect, correctExternals, correctWords)
+
# Fraction of words in ownWords that can be paired one-to-one with an equal
# word in otherWords. Each word in otherWords can be consumed by at most one
# pairing, so duplicates are respected. Returns 0.0 when ownWords is empty.
def getRatio(ownWords, otherWords):
    if len(ownWords) == 0:
        return 0.0
    # Work on a consumable copy so each other-word matches at most once.
    remaining = list(otherWords)
    found = 0
    for word in ownWords:
        if word in remaining:
            remaining.remove(word)
            found += 1
    return found / float(len(ownWords))
+
# Mark links in 'doc' correct based on lists computed by the peer process:
# externalCorrectList holds indexes of correct external links (ratio 1.0);
# wordCorrectList holds (index, ratio) pairs for correct in-document links.
# Returns the total number of links updated.
def applyCorrectnessResults(doc, externalCorrectList, wordCorrectList):
    for index in externalCorrectList:
        externalLink = doc.links[index]
        externalLink.status = 'correct-external'
        externalLink.correctRatio = 1.0
    for index, ratio in wordCorrectList:
        wordLink = doc.links[index]
        wordLink.status = 'correct'
        wordLink.correctRatio = ratio
    return len(externalCorrectList) + len(wordCorrectList)
+
# get HALF_WORD_COUNT words (or less if only less is available) in the indicated direction.
# Grabs a window of raw text around 'elem' via getDirectionalContextualText,
# splits it on non-word runs, and retries with a bigger window until enough
# whole words are collected (or the document runs out of text). All words
# are returned lowercased. Relies on module globals HALF_WORD_COUNT and
# HALF_CONTEXT_MIN.
def getDirectionalContextualWords(elem, isBeforeText):
    textCount = HALF_CONTEXT_MIN # should be enough, but if not, grow this variable.
    wordCount = 0
    #since lead or tail text may cut off a word (in the middle of a whole word), ask for one more word than needed and drop the potential half-word)
    while wordCount < HALF_WORD_COUNT: # Should loop only once in typical cases...
        text, noMoreTextAvailable = getDirectionalContextualText(elem, isBeforeText, textCount)
        splitArray = re.split('\\W+', text)
        headPos = 0
        tailPos = len(splitArray)
        # discount empty matches at the beginning/end of the array (the nature of 're.split')
        if tailPos > 0 and splitArray[0] == "":
            headPos = 1
        if tailPos > 1 and splitArray[-1] == "":
            tailPos = -1
        splitArray = splitArray[headPos:tailPos]
        if noMoreTextAvailable and len(splitArray) < HALF_WORD_COUNT: # There just isn't any more text; Call it good enough.
            if isBeforeText:
                return [word.lower() for word in splitArray[1:]] #drop the leading word, which is likely cut-off.
            else:
                return [word.lower() for word in splitArray[:-1]] #drop the trailing word, which is likely cut-off.
        wordCount = len(splitArray)
        textCount += 120 # growth factor on retry
    # word count met or exceeded HALF_WORD_COUNT threshold; trim and return
    if isBeforeText: #use list comprehension to lowercase each word in the list.
        return [word.lower() for word in splitArray[-HALF_WORD_COUNT:]] #back HALF_WORD_COUNT from the end, to the end.
    else:
        return [word.lower() for word in splitArray[:HALF_WORD_COUNT]] # 0 to HALF_WORD_COUNT (exclusive)
+
# Returns a tuple of the requested text and a flag indicating whether more
# text is available to process. Walks the node chain starting at 'elem'
# (backward via .prev when isBeforeText, forward via .next otherwise),
# concatenating TextNode content until at least characterLimit characters
# are gathered or the chain ends. The returned text is clipped to
# characterLimit on the side nearest 'elem'; the flag is True when the
# document ran out of text before the limit was reached.
def getDirectionalContextualText(elem, isBeforeText, characterLimit):
    gathered = ''
    gatheredLen = 0
    node = elem
    while gatheredLen < characterLimit and node != None:
        if isinstance(node, TextNode):
            if isBeforeText:
                # Walking backward: earlier text goes in front.
                gathered = node.textContent + gathered
            else:
                gathered = gathered + node.textContent
            gatheredLen += len(node.textContent)
        if isBeforeText:
            node = node.prev
        else:
            node = node.next
    # Ran off the end of the chain before collecting enough characters?
    ranOut = (node == None and gatheredLen < characterLimit)
    if isBeforeText:
        return (gathered[-characterLimit:], ranOut)
    return (gathered[:characterLimit], ranOut)
+
# A link is "external" when its href does not start with '#', i.e. it is not
# an in-document fragment reference. An empty href counts as external.
# (Improvement: return the boolean expression directly instead of the
# 'if cond: return True / return False' pattern.)
def check4External(link):
    # href[0:1] is safe on empty strings (yields ''), unlike href[0].
    return link.href[0:1] != '#'
+
# Convert an in-document href (e.g. '#some%20id') to the element id it
# targets: drop the leading '#' and percent-decode the remainder.
# NOTE: urllib.unquote is Python 2 (urllib.parse.unquote in Python 3).
def getLinkTarget(href):
    return urllib.unquote(href[1:])
+
+# Validation testing
+# =====================================================
+
# Print a human-readable summary of a parsed document: dropped-tag count,
# link count, and addressable-id count. With enumAll=True, also walks the
# node chain from doc.start, printing every node and a total node count.
# Diagnostic output only (Python 2 print statements).
def dumpDocument(doc, enumAll=False):
    print "----------------"
    print "Document summary"
    print "----------------"
    print "droppedTags: " + str(doc.droppedTags)
    print "number of links in collection: " + str(len(doc.links))
    print "number of addressable ids: " + str(len(doc._idMap.keys()))
    if enumAll == True:
        print "enumeration of nodes in start:"
        head = doc.start
        counter = 0
        # Walk the singly-linked node list via .next until exhausted.
        while head != None:
            if enumAll == True: # always true inside this guard; kept as-is
                print "  " + str(counter) + ") " + str(head)
            head = head.next
            counter += 1
        print "total nodes in document: " + str(counter)
+
# Word-overlap ratio between the contexts of two elements: each context is
# the words before plus the words after the element.
def getAndCompareRatio(elem1, elem2):
    contextWords1 = getDirectionalContextualWords(elem1, True) + getDirectionalContextualWords(elem1, False)
    contextWords2 = getDirectionalContextualWords(elem2, True) + getDirectionalContextualWords(elem2, False)
    return getRatio(contextWords1, contextWords2)
+
# Return up to 150 characters of raw text on each side of 'elem',
# concatenated (before-text first).
def getContextualText(elem):
    textBefore, _unused = getDirectionalContextualText(elem, True, 150)
    textAfter, _unused = getDirectionalContextualText(elem, False, 150)
    return textBefore + textAfter
+
+def runTests(mem):
+ mem.ignoreList = {'http://test/test/test.com': True}
+ mem.progress = 0
+ setGlobals(mem)
+ # test 1
+ parser = LinkAndTextHTMLParser()
+ doc = parser.parse("
first target
goto lastalternate last. This is some content. And here is some links: goto firstlast target
' + res = diffLinksWithMarkupText(markup1, markup1, mem) + #dumpJSONResults(res) + assert res.baseAllLinks[0].href == '#last()', 'test11: no fancy escaping done to these characters by the HTMLParser implementation.' + assert res.baseAllLinks[0].status == 'broken', 'test11: percent-encoded attribute values in id are not converted to match.' + assert res.baseAllLinks[1].href == '#last%28%29', 'test11: no fancy escaping done to percent-encoded characters by the HTMLParser implementation.' + assert res.baseAllLinks[1].status == 'broken', 'test11: href values are always decoded before checking for literal matching ids (see note on Chrome above)' + assert res.baseAllLinks[2].status == 'correct', 'test11: percent-encoded attribute values in hrefs are decoded to match.' + + # test 12 - new indexing technique + markup1 = "One of the common, difficult to figure-out problems in the \n" + markup1 += "current HTML spec is whether links are 'correct'. Not 'correct' as in syntax or as \n" + # -10 -9 + markup1 += "opposite to broken links, but rather that the link in question goes to the semantically\n" + # -8 -7 -6 -5 -4 -3 -2 -1 1 + markup1 += "correct place in the Spec or other linked spec. \n" + # 2 3 4 5 6 7 8 9 10 + markup1 += "Correctness, in this sense, can only be determined by comparing \n" + markup1 += "the links to a canonical 'correct' source. In the case of the W3C HTML spec, the \n" + markup1 += "source used for determining correctness is the WHATWG version of the spec." 
+ doc = parseTextToDocument(markup1) + #dumpDocument(doc, True) + resultWordList = getDirectionalContextualWords(doc.links[0], True) + assert len(resultWordList) == HALF_WORD_COUNT, "test12: getDirectionalContextualWords returns "+str(HALF_WORD_COUNT)+" items from front of link" + testList = ['the', 'semantically', 'correct', 'place', 'in', 'the', 'spec', 'or', 'other', 'linked'] + for i in xrange(len(testList)): + assert testList[i] == resultWordList[i], "test12: validating expected words before link" + resultWordList = getDirectionalContextualWords(doc.links[0], False) + assert len(resultWordList) == HALF_WORD_COUNT, "test12: getDirectionalContextualWords returns "+str(HALF_WORD_COUNT)+" items from back of link" + testList = ['spec', 'correctness', 'in', 'this', 'sense', 'can', 'only', 'be', 'determined', 'by'] + for i in xrange(len(testList)): + assert testList[i] == resultWordList[i], "test12: validating expected words after link" + buildIndex(doc) + assert len(doc.index) == 17, "test12: total number of unique words indexed is 17" + testList = ['be',1, 'or',1, 'this',1, 'the',2, 'in',2, 'correctness',1, 'spec',2, 'by',1, 'only',1, 'other',1, 'place',1, 'can',1, 'sense',1, 'correct',1, 'semantically',1, 'determined',1, 'linked',1] + for i in xrange(0, len(testList), 2): + assert testList[i] in doc.index, "test12: only expected words are in the index" + indexlist = doc.index[testList[i]] + assert indexlist[0] == 0, "test12: all indexed words belong to link 0" + assert indexlist[1] == testList[i+1], "test12: word counts for each indexed entry are correct" + + # test 13 - duplicate words don't cause match overflow + markup1 = "aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa\n" + markup1 += "aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa" + res = diffLinksWithMarkupText(markup1, markup1, mem) + #dumpJSONResults(res) + assert len(res.baseAllLinks) == 2, 'test13: parsing validation-- 2 links in markup1' + assert res.baseAllLinks[0].status == 
'correct-external', 'test13: link matching validation: external link is correctly matched' + assert res.baseAllLinks[0].matchIndex == 0, 'test13: link matching validation: matched at 0' + assert res.srcAllLinks[res.baseAllLinks[0].matchIndex].href == 'http://external', 'test13: correct index (0) matched in source doc' + assert res.baseAllLinks[0].matchRatio > 0.99, 'test13: link matching validation: Ratio is 1.0' + assert res.baseAllLinks[0].correctRatio == 1.0, 'test13: link matching validation: value is 1.0' + assert res.baseAllLinks[0].lineNo == 1, 'test13: line number is correct (expected: 1)' + assert res.baseAllLinks[1].status == 'correct-external', 'test13: link matching validation: external link is correctly matched' + assert res.baseAllLinks[1].matchIndex == 1, 'test13: link matching validation: matched at 1' + assert res.srcAllLinks[res.baseAllLinks[1].matchIndex].href == 'http://place', 'test13: correct link (1) matched in source doc' + assert res.baseAllLinks[1].matchRatio > 0.99, 'test13: link matching validation: Ratio is 1.0' + assert res.baseAllLinks[1].correctRatio == 1.0, 'test13: link matching validation: correct--1.0 ratio' + assert res.baseAllLinks[1].lineNo == 2, 'test13: line number is correct (expected: 2)' + + print 'All tests passed' + +# Input processing +# ===================================================== + +def cmdSimpleHelp(): + print "Usage:" + print " linkdiff [flags]