diff --git a/src/Elastic.Documentation.Configuration/ReleaseNotes/ReleaseNotesSerialization.cs b/src/Elastic.Documentation.Configuration/ReleaseNotes/ReleaseNotesSerialization.cs index 640aff2d3..6cb726387 100644 --- a/src/Elastic.Documentation.Configuration/ReleaseNotes/ReleaseNotesSerialization.cs +++ b/src/Elastic.Documentation.Configuration/ReleaseNotes/ReleaseNotesSerialization.cs @@ -8,6 +8,7 @@ using System.Text.RegularExpressions; using Elastic.Documentation.Configuration.Serialization; using Elastic.Documentation.ReleaseNotes; +using Elastic.Documentation.Text; using YamlDotNet.Serialization; using YamlDotNet.Serialization.NamingConventions; @@ -50,6 +51,7 @@ public static partial class ReleaseNotesSerialization /// public static ChangelogEntry DeserializeEntry(string yaml) { + yaml = Utf8TextNormalization.StripLeadingUtf8Bom(yaml)!; var yamlDto = YamlDeserializer.Deserialize(yaml); return ToEntry(yamlDto); } @@ -71,6 +73,7 @@ public static ChangelogEntry DeserializeEntry(string yaml) /// public static Bundle DeserializeBundle(string yaml) { + yaml = Utf8TextNormalization.StripLeadingUtf8Bom(yaml)!; var yamlDto = YamlDeserializer.Deserialize(yaml); return ToBundle(yamlDto); } @@ -364,6 +367,7 @@ private static ChangelogEntryType ParseEntryType(string? value) /// The normalized YAML content. public static string NormalizeYaml(string yaml) { + yaml = Utf8TextNormalization.StripLeadingUtf8Bom(yaml)!; // Skip comment lines var yamlLines = yaml.Split('\n'); var yamlWithoutComments = string.Join('\n', yamlLines.Where(line => !line.TrimStart().StartsWith('#'))); diff --git a/src/Elastic.Documentation/Text/Utf8TextNormalization.cs b/src/Elastic.Documentation/Text/Utf8TextNormalization.cs new file mode 100644 index 000000000..049a5af62 --- /dev/null +++ b/src/Elastic.Documentation/Text/Utf8TextNormalization.cs @@ -0,0 +1,57 @@ +// Licensed to Elasticsearch B.V under one or more agreements. +// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. +// See the LICENSE file in the project root for more information + +namespace Elastic.Documentation.Text; + +/// +/// UTF-8 text normalization utilities for handling Byte Order Marks (BOMs) and related text encoding concerns. +/// +public static class Utf8TextNormalization +{ + /// + /// UTF-8 Byte Order Mark character (U+FEFF Zero Width No-Break Space). + /// + public const char Utf8BomChar = '\uFEFF'; + + /// + /// UTF-8 Byte Order Mark byte sequence (EF BB BF). + /// + public static readonly byte[] Utf8BomBytes = [0xEF, 0xBB, 0xBF]; + + /// + /// Strips all consecutive leading UTF-8 BOM characters (U+FEFF) from the beginning of a string. + /// + /// This method removes the UTF-8 Byte Order Mark / Zero Width No-Break Space character only. + /// It does NOT strip other zero-width characters like U+200B (Zero Width Space) or U+2060 (Word Joiner) + /// as they can appear in legitimate content and are not part of the UTF-8 BOM sequence. + /// + /// + /// The input string, which may be null or empty. + /// The string with leading BOM characters removed, or the original string if null/empty or no BOM present. + public static string? StripLeadingUtf8Bom(string? text) + { + if (string.IsNullOrEmpty(text)) + return text; + + // Strip all consecutive leading U+FEFF characters + var span = text.AsSpan(); + while (span.Length > 0 && span[0] == Utf8BomChar) + { + span = span[1..]; + } + + return span.Length == text.Length ? text : span.ToString(); + } + + /// + /// Checks if the given byte span starts with the UTF-8 Byte Order Mark sequence (EF BB BF). + /// + /// The byte span to check. + /// True if the span starts with the UTF-8 BOM sequence, false otherwise. + public static bool HasUtf8Bom(ReadOnlySpan bytes) => + bytes.Length >= 3 && + bytes[0] == 0xEF && + bytes[1] == 0xBB && + bytes[2] == 0xBF; +} diff --git a/src/services/Elastic.Changelog/Bundling/ChangelogBundleAmendService.cs b/src/services/Elastic.Changelog/Bundling/ChangelogBundleAmendService.cs index a744c8a22..4120b13a2 100644 --- a/src/services/Elastic.Changelog/Bundling/ChangelogBundleAmendService.cs +++ b/src/services/Elastic.Changelog/Bundling/ChangelogBundleAmendService.cs @@ -7,6 +7,7 @@ using System.Text; using System.Text.RegularExpressions; using Elastic.Changelog.Configuration; +using Elastic.Changelog.Utilities; using Elastic.Documentation.Configuration; using Elastic.Documentation.Configuration.Assembler; using Elastic.Documentation.Configuration.Changelog; @@ -50,6 +51,11 @@ public partial class ChangelogBundleAmendService( ScopedFileSystem? fileSystem = null, IConfigurationContext? configurationContext = null) : IService { + /// + /// UTF-8 encoding without BOM for writing YAML files. + /// + private static readonly UTF8Encoding Utf8NoBom = new(encoderShouldEmitUTF8Identifier: false); + private readonly ILogger _logger = logFactory.CreateLogger(); private readonly IFileSystem _fileSystem = fileSystem ?? FileSystemFactory.RealRead; private readonly ChangelogConfigurationLoader? _configLoader = configurationContext != null @@ -256,7 +262,9 @@ public async Task AmendBundle(IDiagnosticsCollector collector, AmendBundle if (!string.IsNullOrWhiteSpace(outputDir) && !_fileSystem.Directory.Exists(outputDir)) _ = _fileSystem.Directory.CreateDirectory(outputDir); - await _fileSystem.File.WriteAllTextAsync(amendFilePath, yaml, Encoding.UTF8, ctx); + // Strip any leading BOM to ensure clean UTF-8 output for tooling compatibility + var normalizedYaml = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(yaml); + await _fileSystem.File.WriteAllTextAsync(amendFilePath, normalizedYaml, Utf8NoBom, ctx); _logger.LogInformation("Created amend file: {AmendFilePath} with {Count} entries", amendFilePath, entries.Count); return true; diff --git a/src/services/Elastic.Changelog/Bundling/ChangelogBundlingService.cs b/src/services/Elastic.Changelog/Bundling/ChangelogBundlingService.cs index 8747f2f39..bd1adbe4e 100644 --- a/src/services/Elastic.Changelog/Bundling/ChangelogBundlingService.cs +++ b/src/services/Elastic.Changelog/Bundling/ChangelogBundlingService.cs @@ -9,6 +9,7 @@ using Elastic.Changelog.Configuration; using Elastic.Changelog.GitHub; using Elastic.Changelog.Rendering; +using Elastic.Changelog.Utilities; using Elastic.Documentation; using Elastic.Documentation.Configuration; using Elastic.Documentation.Configuration.Assembler; @@ -137,6 +138,11 @@ public partial class ChangelogBundlingService( ? new ChangelogConfigurationLoader(logFactory, configurationContext, fileSystem ?? FileSystemFactory.RealRead) : null; + /// + /// UTF-8 encoding without BOM for writing YAML files. + /// + private static readonly UTF8Encoding Utf8NoBom = new(encoderShouldEmitUTF8Identifier: false); + [GeneratedRegex(@"(\s+)version:", RegexOptions.Multiline)] internal static partial Regex VersionToTargetRegex(); @@ -766,7 +772,9 @@ private async Task WriteBundleFileAsync(Bundle bundledData, string outputPath, C } // Write bundled file with explicit UTF-8 encoding to ensure proper character handling - await _fileSystem.File.WriteAllTextAsync(outputPath, bundledYaml, Encoding.UTF8, ctx); + // Strip any leading BOM to ensure clean UTF-8 output for tooling compatibility + var normalizedYaml = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(bundledYaml); + await _fileSystem.File.WriteAllTextAsync(outputPath, normalizedYaml, Utf8NoBom, ctx); _logger.LogInformation("Created bundled changelog: {OutputPath}", outputPath); } diff --git a/src/services/Elastic.Changelog/Creation/ChangelogFileWriter.cs b/src/services/Elastic.Changelog/Creation/ChangelogFileWriter.cs index ece1e60c7..8c40eac84 100644 --- a/src/services/Elastic.Changelog/Creation/ChangelogFileWriter.cs +++ b/src/services/Elastic.Changelog/Creation/ChangelogFileWriter.cs @@ -4,6 +4,7 @@ using System.IO.Abstractions; using System.Text; +using Elastic.Changelog.Utilities; using Elastic.Documentation; using Elastic.Documentation.Configuration.Changelog; using Elastic.Documentation.Configuration.ReleaseNotes; @@ -18,6 +19,10 @@ namespace Elastic.Changelog.Creation; /// public class ChangelogFileWriter(IFileSystem fileSystem, ILogger logger) { + /// + /// UTF-8 encoding without BOM for writing YAML files. + /// + private static readonly UTF8Encoding Utf8NoBom = new(encoderShouldEmitUTF8Identifier: false); /// /// Writes a changelog file with the given data. /// @@ -46,8 +51,9 @@ public async Task WriteChangelogAsync( var filename = GenerateFilename(collector, input); var filePath = fileSystem.Path.Join(outputDir, filename); - // Write file with explicit UTF-8 encoding to ensure proper character handling - await fileSystem.File.WriteAllTextAsync(filePath, yamlContent, Encoding.UTF8, ctx); + // Write UTF-8 text without BOM using explicit encoding instance. + var normalizedContent = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(yamlContent); + await fileSystem.File.WriteAllTextAsync(filePath, normalizedContent, Utf8NoBom, ctx); logger.LogInformation("Created changelog fragment: {FilePath}", filePath); return true; diff --git a/src/services/Elastic.Changelog/Evaluation/ChangelogPrepareArtifactService.cs b/src/services/Elastic.Changelog/Evaluation/ChangelogPrepareArtifactService.cs index cf0305b29..a7b768073 100644 --- a/src/services/Elastic.Changelog/Evaluation/ChangelogPrepareArtifactService.cs +++ b/src/services/Elastic.Changelog/Evaluation/ChangelogPrepareArtifactService.cs @@ -3,9 +3,11 @@ // See the LICENSE file in the project root for more information using System.IO.Abstractions; +using System.Text; using System.Text.Json; using Actions.Core.Services; using Elastic.Changelog.Configuration; +using Elastic.Changelog.Utilities; using Elastic.Documentation.Configuration; using Elastic.Documentation.Diagnostics; using Elastic.Documentation.Services; @@ -21,6 +23,11 @@ public class ChangelogPrepareArtifactService( IFileSystem? fileSystem = null ) : IService { + /// + /// UTF-8 encoding without BOM for writing YAML files. + /// + private static readonly UTF8Encoding Utf8NoBom = new(encoderShouldEmitUTF8Identifier: false); + private readonly ILogger _logger = logFactory.CreateLogger(); private readonly IFileSystem _fileSystem = fileSystem ?? new FileSystem(); private readonly ChangelogConfigurationLoader _configLoader = new(logFactory, configurationContext, fileSystem ?? new FileSystem()); @@ -48,8 +55,11 @@ public async Task PrepareArtifact(IDiagnosticsCollector collector, Prepare _logger.LogInformation("Reusing existing filename {Filename} for stable path on branch", changelogFilename); var destYaml = _fileSystem.Path.Combine(input.OutputDir, changelogFilename); - _fileSystem.File.Copy(sourceYaml, destYaml, overwrite: true); - _logger.LogInformation("Copied changelog YAML: {Source} → {Dest}", sourceYaml, destYaml); + // Read YAML, normalize to remove any BOM, then write UTF-8 bytes without BOM (avoids provider-specific WriteAllText preamble behavior). + var yamlContent = await _fileSystem.File.ReadAllTextAsync(sourceYaml, ctx); + var normalizedContent = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(yamlContent); + await _fileSystem.File.WriteAllTextAsync(destYaml, normalizedContent, Utf8NoBom, ctx); + _logger.LogInformation("Normalized and copied changelog YAML: {Source} → {Dest}", sourceYaml, destYaml); } else { diff --git a/src/services/Elastic.Changelog/GithubRelease/GitHubReleaseChangelogService.cs b/src/services/Elastic.Changelog/GithubRelease/GitHubReleaseChangelogService.cs index 958c74bcf..2351414d4 100644 --- a/src/services/Elastic.Changelog/GithubRelease/GitHubReleaseChangelogService.cs +++ b/src/services/Elastic.Changelog/GithubRelease/GitHubReleaseChangelogService.cs @@ -8,6 +8,7 @@ using Elastic.Changelog.Bundling; using Elastic.Changelog.Configuration; using Elastic.Changelog.GitHub; +using Elastic.Changelog.Utilities; using Elastic.Documentation; using Elastic.Documentation.Configuration; using Elastic.Documentation.Configuration.Changelog; @@ -86,6 +87,11 @@ public class GitHubReleaseChangelogService( ChangelogBundlingService? bundlingService = null ) : IService { + /// + /// UTF-8 encoding without BOM for writing YAML files. + /// + private static readonly UTF8Encoding Utf8NoBom = new(encoderShouldEmitUTF8Identifier: false); + private readonly ILogger _logger = logFactory.CreateLogger(); private readonly IFileSystem _fileSystem = fileSystem ?? FileSystemFactory.RealRead; private readonly ChangelogConfigurationLoader _configLoader = new(logFactory, configurationContext, fileSystem ?? FileSystemFactory.RealRead); @@ -301,7 +307,9 @@ private async Task ProcessPrReference( var slug = ChangelogTextUtilities.GenerateSlug(title); var filename = $"{prRef.PrNumber}-{finalType.ToStringFast(true)}-{slug}.yaml"; var filePath = _fileSystem.Path.Join(outputDir, filename); - await _fileSystem.File.WriteAllTextAsync(filePath, yamlContent, Encoding.UTF8, ctx); + // Strip any leading BOM to ensure clean UTF-8 output for tooling compatibility + var normalizedContent = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(yamlContent); + await _fileSystem.File.WriteAllTextAsync(filePath, normalizedContent, Utf8NoBom, ctx); createdFiles.Add(filename); _logger.LogDebug("Created changelog: {FilePath}", filePath); diff --git a/src/services/Elastic.Changelog/Utilities/ChangelogUtf8Normalization.cs b/src/services/Elastic.Changelog/Utilities/ChangelogUtf8Normalization.cs new file mode 100644 index 000000000..d844cfe05 --- /dev/null +++ b/src/services/Elastic.Changelog/Utilities/ChangelogUtf8Normalization.cs @@ -0,0 +1,43 @@ +// Licensed to Elasticsearch B.V under one or more agreements. +// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. +// See the LICENSE file in the project root for more information + +using System; +using Elastic.Documentation.Text; + +namespace Elastic.Changelog.Utilities; + +/// +/// Utilities for normalizing UTF-8 encoding in changelog YAML files. +/// Ensures YAML output is UTF-8 without BOM for better tooling compatibility and review ergonomics. +/// This class now serves as a thin forwarder to the shared UTF-8 text normalization utilities. +/// +public static class ChangelogUtf8Normalization +{ + /// + /// UTF-8 Byte Order Mark character (U+FEFF). + /// + public const char Utf8BomChar = Utf8TextNormalization.Utf8BomChar; + + /// + /// UTF-8 Byte Order Mark as byte sequence (EF BB BF). + /// + public static readonly byte[] Utf8BomBytes = Utf8TextNormalization.Utf8BomBytes; + + /// + /// Strips the leading UTF-8 BOM character from a string if present. + /// YAML should be UTF-8 without BOM for tooling and review ergonomics. + /// + /// The text to normalize + /// Text with leading BOM character removed if it was present + public static string StripLeadingUtf8BomChar(string text) => + Utf8TextNormalization.StripLeadingUtf8Bom(text)!; + + /// + /// Checks if a byte span starts with the UTF-8 BOM sequence (EF BB BF). + /// + /// The byte span to check + /// True if the span starts with UTF-8 BOM bytes + public static bool HasUtf8Bom(ReadOnlySpan bytes) => + Utf8TextNormalization.HasUtf8Bom(bytes); +} diff --git a/src/tooling/docs-builder/Commands/ChangelogCommand.cs b/src/tooling/docs-builder/Commands/ChangelogCommand.cs index e81b88a2f..ba585f8d5 100644 --- a/src/tooling/docs-builder/Commands/ChangelogCommand.cs +++ b/src/tooling/docs-builder/Commands/ChangelogCommand.cs @@ -18,6 +18,7 @@ using Elastic.Changelog.GithubRelease; using Elastic.Changelog.Rendering; using Elastic.Changelog.Uploading; +using Elastic.Changelog.Utilities; using Elastic.Documentation.Configuration; using Elastic.Documentation.Diagnostics; using Elastic.Documentation.ReleaseNotes; @@ -162,6 +163,8 @@ public Task Init( try { var content = _fileSystem.File.ReadAllText(configPath); + // Strip any leading BOM that might be present after reading + content = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(content); if (useNonDefaultChangelogDir) { @@ -175,7 +178,9 @@ public Task Init( content = BundleOutputDirectoryRegex().Replace(content, "$1" + outputValue); } - _fileSystem.File.WriteAllText(configPath, content); + // Ensure normalized content is written without BOM + var normalizedContent = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(content); + _fileSystem.File.WriteAllText(configPath, normalizedContent); _logger.LogInformation("Updated bundle paths in changelog configuration: {ConfigPath}", configPath); } catch (IOException ex) diff --git a/tests/Elastic.Changelog.Tests/Changelogs/BundleChangelogsTests.cs b/tests/Elastic.Changelog.Tests/Changelogs/BundleChangelogsTests.cs index 74d4f89c4..7e53004ba 100644 --- a/tests/Elastic.Changelog.Tests/Changelogs/BundleChangelogsTests.cs +++ b/tests/Elastic.Changelog.Tests/Changelogs/BundleChangelogsTests.cs @@ -5,6 +5,7 @@ using System.Text; using AwesomeAssertions; using Elastic.Changelog.Bundling; +using Elastic.Changelog.Utilities; using Elastic.Documentation.Configuration; using Elastic.Documentation.Diagnostics; using Microsoft.Extensions.Logging.Abstractions; @@ -6124,6 +6125,59 @@ await FileSystem.File.WriteAllTextAsync(configPath, bundleContent.Should().Contain("release-date:", "release date should be auto-populated when bundle.release_dates is true"); } + [Fact] + public async Task BundleChangelogs_WithBomPrefixedInput_ProducesNormalizedOutput() + { + // Arrange - Create changelog with BOM prefix + // language=yaml + var changelogContent = + """ + title: Test changelog with BOM + type: feature + products: + - product: elasticsearch + target: 9.2.0 + lifecycle: ga + prs: + - https://github.com/elastic/elasticsearch/pull/123 + """; + + // Add UTF-8 BOM to the content + var contentWithBom = ChangelogUtf8Normalization.Utf8BomChar + changelogContent; + var changelogFile = FileSystem.Path.Join(_changelogDir, "changelog-with-bom.yaml"); + + // Write the file with BOM using explicit encoding + await FileSystem.File.WriteAllTextAsync(changelogFile, contentWithBom, Encoding.UTF8, TestContext.Current.CancellationToken); + + // Verify the source file has BOM by reading as bytes + var sourceBytes = await FileSystem.File.ReadAllBytesAsync(changelogFile, TestContext.Current.CancellationToken); + ChangelogUtf8Normalization.HasUtf8Bom(sourceBytes).Should().BeTrue("source file should contain BOM"); + + var outputPath = FileSystem.Path.Join(Paths.WorkingDirectoryRoot.FullName, Guid.NewGuid().ToString(), "bundle.yaml"); + var input = new BundleChangelogsArguments + { + Directory = _changelogDir, + All = true, + Output = outputPath + }; + + // Act + var result = await Service.BundleChangelogs(Collector, input, TestContext.Current.CancellationToken); + + // Assert + result.Should().BeTrue("bundling should succeed"); + Collector.Errors.Should().Be(0); + + // Verify output file does not contain BOM + var outputBytes = await FileSystem.File.ReadAllBytesAsync(outputPath, TestContext.Current.CancellationToken); + ChangelogUtf8Normalization.HasUtf8Bom(outputBytes).Should().BeFalse("bundled output should not contain UTF-8 BOM"); + + // Verify content refs (bundle uses file refs + checksum unless resolve inlines entries) + var bundleContent = await FileSystem.File.ReadAllTextAsync(outputPath, TestContext.Current.CancellationToken); + bundleContent.Should().Contain("changelog-with-bom.yaml"); + bundleContent.Should().Contain("entries:"); + } + private void CreateSampleChangelogs() { // language=yaml diff --git a/tests/Elastic.Changelog.Tests/Creation/ChangelogCreationServiceTests.cs b/tests/Elastic.Changelog.Tests/Creation/ChangelogCreationServiceTests.cs index 386cfd47e..b4f7e4fc3 100644 --- a/tests/Elastic.Changelog.Tests/Creation/ChangelogCreationServiceTests.cs +++ b/tests/Elastic.Changelog.Tests/Creation/ChangelogCreationServiceTests.cs @@ -3,10 +3,12 @@ // See the LICENSE file in the project root for more information using System.IO.Abstractions.TestingHelpers; +using System.Text; using AwesomeAssertions; using Elastic.Changelog.Creation; using Elastic.Changelog.GitHub; using Elastic.Changelog.Tests.Changelogs; +using Elastic.Changelog.Utilities; using Elastic.Documentation.Configuration; using FakeItEasy; @@ -230,4 +232,43 @@ public async Task CreateChangelog_TempOutputDirectory_Succeeds() writeFs.Directory.Exists(tempOutput).Should().BeTrue(); writeFs.Directory.GetFiles(tempOutput, "*.yaml").Should().NotBeEmpty(); } + + [Fact] + public async Task CreateChangelog_OutputDoesNotContainBom() + { + await WriteConfig(ConfigWithProductLabels); + var tempOutput = Path.Join(Paths.WorkingDirectoryRoot.FullName, Guid.NewGuid().ToString()); + FileSystem.Directory.CreateDirectory(tempOutput); + + var service = new ChangelogCreationService(LoggerFactory, ConfigurationContext, _mockGitHub, FileSystem, null); + var input = new CreateChangelogArguments + { + Title = "Test BOM handling", + Type = "feature", + Products = [new ProductArgument { Product = "elasticsearch", Target = "9.1.0", Lifecycle = "ga" }], + Config = Path.Join(Paths.WorkingDirectoryRoot.FullName, "config", "changelog.yml"), + Output = tempOutput, + Concise = true + }; + + // Act + var result = await service.CreateChangelog(Collector, input, TestContext.Current.CancellationToken); + + // Assert + result.Should().BeTrue("changelog creation should succeed"); + Collector.Errors.Should().Be(0); + + // Verify created file does not contain BOM + var yamlFiles = FileSystem.Directory.GetFiles(tempOutput, "*.yaml"); + yamlFiles.Should().NotBeEmpty("should create a YAML file"); + + var yamlFile = yamlFiles[0]; + var bytes = await FileSystem.File.ReadAllBytesAsync(yamlFile, TestContext.Current.CancellationToken); + ChangelogUtf8Normalization.HasUtf8Bom(bytes).Should().BeFalse("created changelog should not contain UTF-8 BOM"); + + // Verify content is correct + var content = await FileSystem.File.ReadAllTextAsync(yamlFile, TestContext.Current.CancellationToken); + content.Should().Contain("Test BOM handling"); + content.Should().Contain("type: feature"); + } } diff --git a/tests/Elastic.Changelog.Tests/Evaluation/ChangelogPrepareArtifactServiceTests.cs b/tests/Elastic.Changelog.Tests/Evaluation/ChangelogPrepareArtifactServiceTests.cs index 23ec00ba7..6e8450bba 100644 --- a/tests/Elastic.Changelog.Tests/Evaluation/ChangelogPrepareArtifactServiceTests.cs +++ b/tests/Elastic.Changelog.Tests/Evaluation/ChangelogPrepareArtifactServiceTests.cs @@ -2,11 +2,13 @@ // Elasticsearch B.V licenses this file to you under the Apache 2.0 License. // See the LICENSE file in the project root for more information +using System.Text; using System.Text.Json; using Actions.Core.Services; using AwesomeAssertions; using Elastic.Changelog.Evaluation; using Elastic.Changelog.Tests.Changelogs; +using Elastic.Changelog.Utilities; using Elastic.Documentation.Configuration; using Elastic.Documentation.ReleaseNotes; using FakeItEasy; @@ -242,6 +244,59 @@ public async Task PrepareArtifact_MissingStagingYaml_StatusError() metadata.Status.Should().Be("error"); } + [Fact] + public async Task PrepareArtifact_WithBomPrefixedYaml_NormalizesOutput() + { + // Arrange + await SetupConfig(); + FileSystem.Directory.CreateDirectory(StagingDir); + + // Create YAML with BOM prefix + const string yamlContent = """ + title: Test changelog + type: feature + products: + - product: elasticsearch + target: 9.1.0 + lifecycle: ga + """; + + var contentWithBom = ChangelogUtf8Normalization.Utf8BomChar + yamlContent; + var stagingYaml = Path.Join(StagingDir, "changelog.yaml"); + await FileSystem.File.WriteAllTextAsync(stagingYaml, contentWithBom, Encoding.UTF8, CancellationToken.None); + + // Verify staging file has BOM + var stagingBytes = await FileSystem.File.ReadAllBytesAsync(stagingYaml, CancellationToken.None); + ChangelogUtf8Normalization.HasUtf8Bom(stagingBytes).Should().BeTrue("staging file should contain BOM"); + + var service = CreateService(); + var args = DefaultArgs() with + { + EvaluateStatus = "proceed", + GenerateOutcome = "success" + }; + + // Act + await service.PrepareArtifact(Collector, args, CancellationToken.None); + + // Assert + var outputYaml = Path.Join(OutputDir, "changelog.yaml"); + FileSystem.File.Exists(outputYaml).Should().BeTrue("output YAML file should exist"); + + // Verify output file does not contain BOM + var outputBytes = await FileSystem.File.ReadAllBytesAsync(outputYaml, CancellationToken.None); + ChangelogUtf8Normalization.HasUtf8Bom(outputBytes).Should().BeFalse("output file should not contain UTF-8 BOM"); + + // Verify content is preserved + var outputContent = await FileSystem.File.ReadAllTextAsync(outputYaml, CancellationToken.None); + outputContent.Should().Contain("Test changelog"); + outputContent.Should().Contain("type: feature"); + + var metadata = ReadMetadata(); + metadata.Status.Should().Be("success"); + metadata.ChangelogFilename.Should().Be("changelog.yaml"); + } + [Theory] [InlineData("proceed", "success", PrEvaluationResult.Success)] [InlineData("proceed", "failure", PrEvaluationResult.Error)] diff --git a/tests/Elastic.Changelog.Tests/Utilities/ChangelogUtf8NormalizationTests.cs b/tests/Elastic.Changelog.Tests/Utilities/ChangelogUtf8NormalizationTests.cs new file mode 100644 index 000000000..ce94db676 --- /dev/null +++ b/tests/Elastic.Changelog.Tests/Utilities/ChangelogUtf8NormalizationTests.cs @@ -0,0 +1,155 @@ +// Licensed to Elasticsearch B.V under one or more agreements. +// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. +// See the LICENSE file in the project root for more information + +using AwesomeAssertions; +using Elastic.Changelog.Utilities; + +namespace Elastic.Changelog.Tests.Utilities; + +public class ChangelogUtf8NormalizationTests +{ + [Fact] + public void StripLeadingUtf8BomChar_EmptyString_ReturnsEmpty() + { + var result = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(string.Empty); + + result.Should().Be(string.Empty); + } + + [Fact] + public void StripLeadingUtf8BomChar_NullString_ReturnsNull() + { + var result = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(null!); + + result.Should().BeNull(); + } + + [Fact] + public void StripLeadingUtf8BomChar_StringWithoutBom_ReturnsUnchanged() + { + const string input = "type: feature\ntitle: Test"; + + var result = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(input); + + result.Should().Be(input); + } + + [Fact] + public void StripLeadingUtf8BomChar_StringWithLeadingBom_RemovesBom() + { + const string content = "type: feature\ntitle: Test"; + var input = ChangelogUtf8Normalization.Utf8BomChar + content; + + var result = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(input); + + result.Should().Be(content); + } + + [Fact] + public void StripLeadingUtf8BomChar_StringOnlyBom_ReturnsEmpty() + { + var input = ChangelogUtf8Normalization.Utf8BomChar.ToString(); + + var result = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(input); + + result.Should().Be(string.Empty); + } + + [Fact] + public void StripLeadingUtf8BomChar_StringWithBomInMiddle_DoesNotChange() + { + var input = $"type: feature{ChangelogUtf8Normalization.Utf8BomChar}title: Test"; + + var result = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(input); + + result.Should().Be(input); + } + + [Fact] + public void StripLeadingUtf8BomChar_StringWithConsecutiveLeadingBoms_RemovesAllLeadingBoms() + { + const string content = "type: feature\ntitle: Test"; + // Two consecutive BOM characters at the start + var input = ChangelogUtf8Normalization.Utf8BomChar.ToString() + + ChangelogUtf8Normalization.Utf8BomChar + content; + + var result = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(input); + + result.Should().Be(content); + } + + [Fact] + public void StripLeadingUtf8BomChar_StringWithThreeConsecutiveLeadingBoms_RemovesAllLeadingBoms() + { + const string content = "type: feature\ntitle: Test"; + // Three consecutive BOM characters at the start (edge case test) + var input = ChangelogUtf8Normalization.Utf8BomChar.ToString() + + ChangelogUtf8Normalization.Utf8BomChar + + ChangelogUtf8Normalization.Utf8BomChar + content; + + var result = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(input); + + result.Should().Be(content); + } + + [Fact] + public void HasUtf8Bom_EmptySpan_ReturnsFalse() + { + var bytes = ReadOnlySpan.Empty; + + var result = ChangelogUtf8Normalization.HasUtf8Bom(bytes); + + result.Should().BeFalse(); + } + + [Fact] + public void HasUtf8Bom_TooShortSpan_ReturnsFalse() + { + var bytes = new ReadOnlySpan([0xEF, 0xBB]); + + var result = ChangelogUtf8Normalization.HasUtf8Bom(bytes); + + result.Should().BeFalse(); + } + + [Fact] + public void HasUtf8Bom_ValidBomBytes_ReturnsTrue() + { + var bytes = new ReadOnlySpan([0xEF, 0xBB, 0xBF, 0x74, 0x79]); + + var result = ChangelogUtf8Normalization.HasUtf8Bom(bytes); + + result.Should().BeTrue(); + } + + [Fact] + public void HasUtf8Bom_ExactBomBytes_ReturnsTrue() + { + var bytes = new ReadOnlySpan([0xEF, 0xBB, 0xBF]); + + var result = ChangelogUtf8Normalization.HasUtf8Bom(bytes); + + result.Should().BeTrue(); + } + + [Fact] + public void HasUtf8Bom_InvalidBomBytes_ReturnsFalse() + { + var bytes = new ReadOnlySpan([0xEF, 0xBB, 0xBE, 0x74, 0x79]); + + var result = ChangelogUtf8Normalization.HasUtf8Bom(bytes); + + result.Should().BeFalse(); + } + + [Fact] + public void HasUtf8Bom_NormalYamlBytes_ReturnsFalse() + { + var bytes = new ReadOnlySpan([0x74, 0x79, 0x70, 0x65]); + + var result = ChangelogUtf8Normalization.HasUtf8Bom(bytes); + + result.Should().BeFalse(); + } +} diff --git a/tests/Elastic.Documentation.Configuration.Tests/Text/Utf8TextNormalizationTests.cs b/tests/Elastic.Documentation.Configuration.Tests/Text/Utf8TextNormalizationTests.cs new file mode 100644 index 000000000..576f92ca7 --- /dev/null +++ b/tests/Elastic.Documentation.Configuration.Tests/Text/Utf8TextNormalizationTests.cs @@ -0,0 +1,176 @@ +// Licensed to Elasticsearch B.V under one or more agreements. +// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. +// See the LICENSE file in the project root for more information + +using AwesomeAssertions; +using Elastic.Documentation.Text; + +namespace Elastic.Documentation.Configuration.Tests.Text; + +public class Utf8TextNormalizationTests +{ + [Fact] + public void StripLeadingUtf8Bom_EmptyString_ReturnsEmpty() + { + var result = Utf8TextNormalization.StripLeadingUtf8Bom(string.Empty); + + result.Should().Be(string.Empty); + } + + [Fact] + public void StripLeadingUtf8Bom_NullString_ReturnsNull() + { + var result = Utf8TextNormalization.StripLeadingUtf8Bom(null); + + result.Should().BeNull(); + } + + [Fact] + public void StripLeadingUtf8Bom_StringWithoutBom_ReturnsUnchanged() + { + const string input = "type: feature\ntitle: Test changelog entry"; + + var result = Utf8TextNormalization.StripLeadingUtf8Bom(input); + + result.Should().BeSameAs(input); // Should return the same instance for efficiency + } + + [Fact] + public void StripLeadingUtf8Bom_StringWithSingleLeadingBom_RemovesBom() + { + const string content = "type: feature\ntitle: Test changelog entry"; + var input = Utf8TextNormalization.Utf8BomChar + content; + + var result = Utf8TextNormalization.StripLeadingUtf8Bom(input); + + result.Should().Be(content); + } + + [Fact] + public void StripLeadingUtf8Bom_StringWithTwoConsecutiveLeadingBoms_RemovesBothBoms() + { + const string content = "type: feature\ntitle: Test changelog entry"; + var input = Utf8TextNormalization.Utf8BomChar.ToString() + + Utf8TextNormalization.Utf8BomChar + content; + + var result = Utf8TextNormalization.StripLeadingUtf8Bom(input); + + result.Should().Be(content); + } + + [Fact] + public void StripLeadingUtf8Bom_StringWithThreeConsecutiveLeadingBoms_RemovesAllBoms() + { + const string content = "type: feature\ntitle: Test changelog entry"; + var input = Utf8TextNormalization.Utf8BomChar.ToString() + + Utf8TextNormalization.Utf8BomChar + + Utf8TextNormalization.Utf8BomChar + content; + + var result = Utf8TextNormalization.StripLeadingUtf8Bom(input); + + result.Should().Be(content); + } + + [Fact] + public void StripLeadingUtf8Bom_StringOnlyBoms_ReturnsEmpty() + { + var input = Utf8TextNormalization.Utf8BomChar.ToString() + + Utf8TextNormalization.Utf8BomChar; + + var result = Utf8TextNormalization.StripLeadingUtf8Bom(input); + + result.Should().Be(string.Empty); + } + + [Fact] + public void StripLeadingUtf8Bom_StringWithBomInMiddle_DoesNotStripMiddleBom() + { + var input = $"type: feature{Utf8TextNormalization.Utf8BomChar}title: Test"; + + var result = Utf8TextNormalization.StripLeadingUtf8Bom(input); + + result.Should().Be(input); + } + + [Fact] + public void StripLeadingUtf8Bom_StringWithBomAtEnd_DoesNotStripEndBom() + { + var input = $"type: feature\ntitle: Test{Utf8TextNormalization.Utf8BomChar}"; + + var result = Utf8TextNormalization.StripLeadingUtf8Bom(input); + + result.Should().Be(input); + } + + [Fact] + public void HasUtf8Bom_EmptySpan_ReturnsFalse() + { + var bytes = ReadOnlySpan.Empty; + + var result = Utf8TextNormalization.HasUtf8Bom(bytes); + + result.Should().BeFalse(); + } + + [Fact] + public void HasUtf8Bom_TooShortSpan_ReturnsFalse() + { + var bytes = new ReadOnlySpan([0xEF, 0xBB]); + + var result = Utf8TextNormalization.HasUtf8Bom(bytes); + + result.Should().BeFalse(); + } + + [Fact] + public void HasUtf8Bom_ValidBomBytes_ReturnsTrue() + { + var bytes = new ReadOnlySpan([0xEF, 0xBB, 0xBF, 0x74, 0x79]); + + var result = Utf8TextNormalization.HasUtf8Bom(bytes); + + result.Should().BeTrue(); + } + + [Fact] + public void HasUtf8Bom_ExactBomBytes_ReturnsTrue() + { + var bytes = new ReadOnlySpan([0xEF, 0xBB, 0xBF]); + + var result = Utf8TextNormalization.HasUtf8Bom(bytes); + + result.Should().BeTrue(); + } + + [Fact] + public void HasUtf8Bom_InvalidBomBytes_ReturnsFalse() + { + var bytes = new ReadOnlySpan([0xEF, 0xBB, 0xBE, 0x74, 0x79]); + + var result = Utf8TextNormalization.HasUtf8Bom(bytes); + + result.Should().BeFalse(); + } + + [Fact] + public void HasUtf8Bom_NormalTextBytes_ReturnsFalse() + { + var bytes = new ReadOnlySpan([0x74, 0x79, 0x70, 0x65]); + + var result = Utf8TextNormalization.HasUtf8Bom(bytes); + + result.Should().BeFalse(); + } + + [Fact] + public void Utf8BomChar_MatchesExpectedValue() + { + Utf8TextNormalization.Utf8BomChar.Should().Be('\uFEFF'); + } + + [Fact] + public void Utf8BomBytes_MatchesExpectedSequence() + { + Utf8TextNormalization.Utf8BomBytes.Should().Equal([0xEF, 0xBB, 0xBF]); + } +}