Skip to content

Commit

Permalink
Bug 1712928 - Gather telemetry about encoding-unlabeled pages and about Repair Text Encoding usage situations. r=emk
Browse files Browse the repository at this point in the history

In particular, gather telemetry to evaluate the impact of unlabeled UTF-8
and how detector-triggered reloads would change if content that was
ASCII-only at the time of the initial guess were treated as UTF-8.

Differential Revision: https://phabricator.services.mozilla.com/D140818
  • Loading branch information
hsivonen committed Mar 29, 2022
1 parent f3508af commit 8dca2aa
Show file tree
Hide file tree
Showing 9 changed files with 219 additions and 150 deletions.
188 changes: 115 additions & 73 deletions docshell/base/nsDocShell.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@
#include "nsIDocShellTreeItem.h"
#include "nsIDocShellTreeOwner.h"
#include "mozilla/dom/Document.h"
#include "nsHTMLDocument.h"
#include "nsIDocumentLoaderFactory.h"
#include "nsIDOMWindow.h"
#include "nsIEditingSession.h"
Expand Down Expand Up @@ -1655,91 +1656,132 @@ nsDocShell::ForceEncodingDetection() {

mForcedAutodetection = true;

LOGCHARSETMENU(("ENCODING_OVERRIDE_USED_AUTOMATIC"));
Telemetry::ScalarSet(Telemetry::ScalarID::ENCODING_OVERRIDE_USED_AUTOMATIC,
true);

nsIURI* url = doc->GetOriginalURI();
bool isFileURL = url && SchemeIsFile(url);

int32_t charsetSource = doc->GetDocumentCharacterSetSource();
auto encoding = doc->GetDocumentCharacterSet();
switch (charsetSource) {
case kCharsetFromInitialUserForcedAutoDetection:
case kCharsetFromFinalUserForcedAutoDetection:
LOGCHARSETMENU(("AutoOverridden"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_2::AutoOverridden);
break;
case kCharsetFromInitialAutoDetectionASCII:
// Deliberately no final version
LOGCHARSETMENU(("UnlabeledAscii"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_2::UnlabeledAscii);
break;
case kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic:
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic:
case kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content:
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content:
LOGCHARSETMENU(("UnlabeledNonUtf8"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_2::UnlabeledNonUtf8);
break;
case kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD:
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD:
LOGCHARSETMENU(("UnlabeledNonUtf8TLD"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_2::UnlabeledNonUtf8TLD);
break;
case kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8:
case kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8:
LOGCHARSETMENU(("UnlabeledUtf8"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_2::UnlabeledUtf8);
break;
case kCharsetFromChannel:
if (encoding == UTF_8_ENCODING) {
LOGCHARSETMENU(("ChannelUtf8"));
// AsHTMLDocument is valid, because we called
// WillIgnoreCharsetOverride() above.
if (doc->AsHTMLDocument()->IsPlainText()) {
switch (charsetSource) {
case kCharsetFromInitialAutoDetectionASCII:
// Deliberately no final version
LOGCHARSETMENU(("TEXT:UnlabeledAscii"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_2::ChannelUtf8);
} else {
LOGCHARSETMENU(("ChannelNonUtf8"));
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_TEXT::UnlabeledAscii);
break;
case kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic:
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic:
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8GenericInitialWasASCII:
case kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content:
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content:
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8ContentInitialWasASCII:
LOGCHARSETMENU(("TEXT:UnlabeledNonUtf8"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_2::ChannelNonUtf8);
}
break;
case kCharsetFromXmlDeclaration:
case kCharsetFromMetaTag:
if (isFileURL) {
LOGCHARSETMENU(("LocalLabeled"));
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_TEXT::
UnlabeledNonUtf8);
break;
case kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD:
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD:
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLDInitialWasASCII:
LOGCHARSETMENU(("TEXT:UnlabeledNonUtf8TLD"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_2::LocalLabeled);
} else if (encoding == UTF_8_ENCODING) {
LOGCHARSETMENU(("MetaUtf8"));
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_TEXT::
UnlabeledNonUtf8TLD);
break;
case kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8:
case kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8InitialWasASCII:
LOGCHARSETMENU(("TEXT:UnlabeledUtf8"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_2::MetaUtf8);
} else {
LOGCHARSETMENU(("MetaNonUtf8"));
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_TEXT::UnlabeledUtf8);
break;
case kCharsetFromChannel:
if (encoding == UTF_8_ENCODING) {
LOGCHARSETMENU(("TEXT:ChannelUtf8"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_TEXT::ChannelUtf8);
} else {
LOGCHARSETMENU(("TEXT:ChannelNonUtf8"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_TEXT::
ChannelNonUtf8);
}
break;
default:
LOGCHARSETMENU(("TEXT:Bug"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_2::MetaNonUtf8);
}
break;
case kCharsetFromFinalAutoDetectionFile:
if (isFileURL) {
LOGCHARSETMENU(("LocalUnlabeled"));
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_TEXT::Bug);
break;
}
} else {
switch (charsetSource) {
case kCharsetFromInitialAutoDetectionASCII:
// Deliberately no final version
LOGCHARSETMENU(("HTML:UnlabeledAscii"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_2::LocalUnlabeled);
} else {
LOGCHARSETMENU(("Bug"));
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_HTML::UnlabeledAscii);
break;
case kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic:
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic:
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8GenericInitialWasASCII:
case kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Content:
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content:
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8ContentInitialWasASCII:
LOGCHARSETMENU(("HTML:UnlabeledNonUtf8"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_2::Bug);
}
break;
default:
LOGCHARSETMENU(("Bug"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_2::Bug);
break;
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_HTML::
UnlabeledNonUtf8);
break;
case kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD:
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD:
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLDInitialWasASCII:
LOGCHARSETMENU(("HTML:UnlabeledNonUtf8TLD"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_HTML::
UnlabeledNonUtf8TLD);
break;
case kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8:
case kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8InitialWasASCII:
LOGCHARSETMENU(("HTML:UnlabeledUtf8"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_HTML::UnlabeledUtf8);
break;
case kCharsetFromChannel:
if (encoding == UTF_8_ENCODING) {
LOGCHARSETMENU(("HTML:ChannelUtf8"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_HTML::ChannelUtf8);
} else {
LOGCHARSETMENU(("HTML:ChannelNonUtf8"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_HTML::
ChannelNonUtf8);
}
break;
case kCharsetFromXmlDeclaration:
case kCharsetFromMetaTag:
if (isFileURL) {
LOGCHARSETMENU(("HTML:LocalLabeled"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_HTML::LocalLabeled);
} else if (encoding == UTF_8_ENCODING) {
LOGCHARSETMENU(("HTML:MetaUtf8"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_HTML::InternalUtf8);
} else {
LOGCHARSETMENU(("HTML:MetaNonUtf8"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_HTML::
InternalNonUtf8);
}
break;
default:
LOGCHARSETMENU(("HTML:Bug"));
Telemetry::AccumulateCategorical(
Telemetry::LABELS_ENCODING_OVERRIDE_SITUATION_HTML::Bug);
break;
}
}
return NS_OK;
}
Expand Down
2 changes: 1 addition & 1 deletion dom/html/nsHTMLDocument.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -675,7 +675,7 @@ bool nsHTMLDocument::WillIgnoreCharsetOverride() {
case kCharsetFromDocTypeDefault:
case kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8:
case kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD:
case kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8:
case kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8InitialWasASCII:
case kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD:
case kCharsetFromParentFrame:
case kCharsetFromXmlDeclaration:
Expand Down
5 changes: 3 additions & 2 deletions layout/base/nsDocumentViewer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2586,9 +2586,10 @@ nsDocumentViewer::SetReloadEncodingAndSource(const Encoding* aEncoding,
int32_t aSource) {
MOZ_ASSERT(
aSource == kCharsetUninitialized ||
(aSource >= kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8 &&
(aSource >=
kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8InitialWasASCII &&
aSource <=
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD) ||
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLDInitialWasASCII) ||
aSource == kCharsetFromFinalUserForcedAutoDetection);
mReloadEncoding = aEncoding;
mReloadEncodingSource = aSource;
Expand Down
49 changes: 33 additions & 16 deletions parser/html/nsHtml5StreamParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,6 @@ nsHtml5StreamParser::nsHtml5StreamParser(nsHtml5TreeOpExecutor* aExecutor,
mInitialEncodingWasFromParentFrame(false),
mHasHadErrors(false),
mDetectorHasSeenNonAscii(false),
mDetectorHadOnlySeenAsciiWhenFirstGuessing(false),
mDecodingLocalFileWithoutTokenizing(false),
mBufferingBytes(false),
mFlushTimer(NS_NewTimer(mEventTarget)),
Expand Down Expand Up @@ -290,20 +289,22 @@ nsresult nsHtml5StreamParser::GetChannel(nsIChannel** aChannel) {

std::tuple<NotNull<const Encoding*>, nsCharsetSource>
nsHtml5StreamParser::GuessEncoding(bool aInitial) {
if (aInitial) {
if (!mDetectorHasSeenNonAscii) {
mDetectorHadOnlySeenAsciiWhenFirstGuessing = true;
}
}
MOZ_ASSERT(
mCharsetSource != kCharsetFromFinalUserForcedAutoDetection &&
mCharsetSource != kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8 &&
mCharsetSource !=
kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8InitialWasASCII &&
mCharsetSource !=
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Generic &&
mCharsetSource !=
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8GenericInitialWasASCII &&
mCharsetSource !=
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content &&
mCharsetSource !=
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8ContentInitialWasASCII &&
mCharsetSource !=
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD &&
mCharsetSource !=
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLDInitialWasASCII &&
mCharsetSource != kCharsetFromFinalAutoDetectionFile);
auto ifHadBeenForced = mDetector->Guess(EmptyCString(), true);
auto encoding =
Expand All @@ -330,13 +331,28 @@ nsHtml5StreamParser::GuessEncoding(bool aInitial) {
} else if (!mDetectorHasSeenNonAscii) {
source = kCharsetFromInitialAutoDetectionASCII; // deliberately Initial
} else if (ifHadBeenForced == UTF_8_ENCODING) {
// XXX subdivide by mDetectorHadOnlySeenAsciiWhenFirstGuessing in
// follow-up Not doing now to scope down the telemetry data review.
source = kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8;
MOZ_ASSERT(mCharsetSource == kCharsetFromInitialAutoDetectionASCII ||
mCharsetSource ==
kCharsetFromInitialAutoDetectionWouldHaveBeenUTF8);
source = kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8InitialWasASCII;
} else if (encoding != ifHadBeenForced) {
source = kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD;
if (mCharsetSource == kCharsetFromInitialAutoDetectionASCII) {
source =
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLDInitialWasASCII;
} else {
source =
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD;
}
} else if (EncodingDetector::TldMayAffectGuess(mTLD)) {
source = kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content;
if (mCharsetSource == kCharsetFromInitialAutoDetectionASCII) {
source =
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8ContentInitialWasASCII;
} else {
source = kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8Content;
}
} else if (mCharsetSource == kCharsetFromInitialAutoDetectionASCII) {
source =
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8GenericInitialWasASCII;
}
} else if (source ==
kCharsetFromInitialAutoDetectionWouldNotHaveBeenUTF8Generic) {
Expand Down Expand Up @@ -485,7 +501,8 @@ nsresult nsHtml5StreamParser::SniffStreamBytes(Span<const uint8_t> aFromSegment,
MOZ_ASSERT(IsParserThread(), "Wrong thread!");
MOZ_ASSERT_IF(aEof, aFromSegment.IsEmpty());

if (mCharsetSource >= kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8 &&
if (mCharsetSource >=
kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8InitialWasASCII &&
mCharsetSource <= kCharsetFromFinalUserForcedAutoDetection) {
if (mMode == PLAIN_TEXT || mMode == VIEW_SOURCE_PLAIN) {
mTreeBuilder->MaybeComplainAboutCharset("EncDetectorReloadPlain", true,
Expand Down Expand Up @@ -2273,9 +2290,9 @@ void nsHtml5StreamParser::ParseAvailableData() {
// Request a reload from the docshell.
MOZ_ASSERT(
(source >=
kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8 &&
kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8InitialWasASCII &&
source <=
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLD) ||
kCharsetFromFinalAutoDetectionWouldNotHaveBeenUTF8DependedOnTLDInitialWasASCII) ||
source == kCharsetFromFinalUserForcedAutoDetection);
mTreeBuilder->NeedsCharsetSwitchTo(encoding, source, 0);
requestedReload = true;
Expand Down Expand Up @@ -2323,7 +2340,7 @@ void nsHtml5StreamParser::ParseAvailableData() {
} else if (
mCharsetSource >= kCharsetFromXmlDeclaration &&
!(mCharsetSource >=
kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8 &&
kCharsetFromFinalAutoDetectionWouldHaveBeenUTF8InitialWasASCII &&
mCharsetSource <=
kCharsetFromFinalUserForcedAutoDetection)) {
mTreeBuilder->MaybeComplainAboutCharset("EncError", true, 0);
Expand Down
2 changes: 0 additions & 2 deletions parser/html/nsHtml5StreamParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -711,8 +711,6 @@ class nsHtml5StreamParser final : public nsISupports {

bool mDetectorHasSeenNonAscii;

bool mDetectorHadOnlySeenAsciiWhenFirstGuessing;

/**
* If true, we are decoding a local file that lacks an encoding
* declaration and we are not tokenizing yet.
Expand Down
Loading

0 comments on commit 8dca2aa

Please sign in to comment.