Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ public static partial class ReleaseNotesSerialization
/// </summary>
public static ChangelogEntry DeserializeEntry(string yaml)
{
yaml = StripLeadingUtf8BomChar(yaml);
var yamlDto = YamlDeserializer.Deserialize<ChangelogEntryDto>(yaml);
return ToEntry(yamlDto);
}
Expand All @@ -71,6 +72,7 @@ public static ChangelogEntry DeserializeEntry(string yaml)
/// </summary>
public static Bundle DeserializeBundle(string yaml)
{
yaml = StripLeadingUtf8BomChar(yaml);
var yamlDto = YamlDeserializer.Deserialize<BundleDto>(yaml);
return ToBundle(yamlDto);
}
Expand Down Expand Up @@ -364,6 +366,7 @@ private static ChangelogEntryType ParseEntryType(string? value)
/// <returns>The normalized YAML content.</returns>
public static string NormalizeYaml(string yaml)
{
yaml = StripLeadingUtf8BomChar(yaml);
// Skip comment lines
var yamlLines = yaml.Split('\n');
var yamlWithoutComments = string.Join('\n', yamlLines.Where(line => !line.TrimStart().StartsWith('#')));
Expand All @@ -372,6 +375,13 @@ public static string NormalizeYaml(string yaml)
return VersionToTargetRegex().Replace(yamlWithoutComments, "$1target:");
}

private static string StripLeadingUtf8BomChar(string text)
{
if (string.IsNullOrEmpty(text))
return text;
return text[0] == '\uFEFF' ? text[1..] : text;
}

/// <summary>
/// Loads the publish blocker configuration from a changelog.yml file.
/// Uses AOT-compatible deserialization via YamlStaticContext.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
using System.Text;
using System.Text.RegularExpressions;
using Elastic.Changelog.Configuration;
using Elastic.Changelog.Utilities;
using Elastic.Documentation.Configuration;
using Elastic.Documentation.Configuration.Assembler;
using Elastic.Documentation.Configuration.Changelog;
Expand Down Expand Up @@ -256,7 +257,9 @@ public async Task<bool> AmendBundle(IDiagnosticsCollector collector, AmendBundle
if (!string.IsNullOrWhiteSpace(outputDir) && !_fileSystem.Directory.Exists(outputDir))
_ = _fileSystem.Directory.CreateDirectory(outputDir);

await _fileSystem.File.WriteAllTextAsync(amendFilePath, yaml, Encoding.UTF8, ctx);
// Strip any leading BOM to ensure clean UTF-8 output for tooling compatibility
var normalizedYaml = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(yaml);
await _fileSystem.File.WriteAllBytesAsync(amendFilePath, Encoding.UTF8.GetBytes(normalizedYaml), ctx);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe we can simply do WriteAllText with new UTF8Encoding(false) instead of Encoding.UTF8.

That way we allocate less strings, see also: https://danielwertheim.se/utf-8-bom-adventures-in-c/

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

460f419 is my attempt to address this comment as well as #3243 (review)

The AI summary for that change is as follows:

Summary of Changes in 460f419

This commit addresses PR review feedback by consolidating BOM helper duplication and optimizing UTF-8 write style:

🔄 Consolidated BOM/String Normalization

  • Created shared utility: New Utf8TextNormalization.cs in Elastic.Documentation/Text/ with enhanced BOM stripping that removes all consecutive leading U+FEFF characters (handles double-BOM scenarios)
  • Updated ReleaseNotesSerialization.cs: Removed private StripLeadingUtf8BomChar method, now uses shared Utf8TextNormalization.StripLeadingUtf8Bom
  • Converted ChangelogUtf8Normalization.cs: Now serves as a thin forwarder to the shared implementation, maintaining backward compatibility

Optimized UTF-8 File Writes

Replaced WriteAllBytesAsync(path, Encoding.UTF8.GetBytes(content)) with WriteAllTextAsync(path, content, Utf8NoBom) in all changelog writers:

  • ChangelogFileWriter.cs
  • ChangelogBundlingService.cs
  • ChangelogBundleAmendService.cs
  • GitHubReleaseChangelogService.cs
  • ChangelogPrepareArtifactService.cs

Each now uses static readonly UTF8Encoding Utf8NoBom = new(encoderShouldEmitUTF8Identifier: false) to avoid per-write byte array allocations.

Enhanced Testing

  • New test file: Utf8TextNormalizationTests.cs with comprehensive shared utility tests
  • Enhanced existing tests: Added consecutive BOM test cases in ChangelogUtf8NormalizationTests.cs
  • Edge case coverage: Tests for multiple BOMs, BOM-only strings, and BOMs in middle/end positions

The changes eliminate code duplication, improve memory efficiency, and provide more robust BOM handling across the codebase.

_logger.LogInformation("Created amend file: {AmendFilePath} with {Count} entries", amendFilePath, entries.Count);

return true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
using Elastic.Changelog.Configuration;
using Elastic.Changelog.GitHub;
using Elastic.Changelog.Rendering;
using Elastic.Changelog.Utilities;
using Elastic.Documentation;
using Elastic.Documentation.Configuration;
using Elastic.Documentation.Configuration.Assembler;
Expand Down Expand Up @@ -766,7 +767,9 @@ private async Task WriteBundleFileAsync(Bundle bundledData, string outputPath, C
}

// Write bundled file with explicit UTF-8 encoding to ensure proper character handling
await _fileSystem.File.WriteAllTextAsync(outputPath, bundledYaml, Encoding.UTF8, ctx);
// Strip any leading BOM to ensure clean UTF-8 output for tooling compatibility
var normalizedYaml = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(bundledYaml);
await _fileSystem.File.WriteAllBytesAsync(outputPath, Encoding.UTF8.GetBytes(normalizedYaml), ctx);
_logger.LogInformation("Created bundled changelog: {OutputPath}", outputPath);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

using System.IO.Abstractions;
using System.Text;
using Elastic.Changelog.Utilities;
using Elastic.Documentation;
using Elastic.Documentation.Configuration.Changelog;
using Elastic.Documentation.Configuration.ReleaseNotes;
Expand Down Expand Up @@ -46,8 +47,9 @@ public async Task<bool> WriteChangelogAsync(
var filename = GenerateFilename(collector, input);
var filePath = fileSystem.Path.Join(outputDir, filename);

// Write file with explicit UTF-8 encoding to ensure proper character handling
await fileSystem.File.WriteAllTextAsync(filePath, yamlContent, Encoding.UTF8, ctx);
// Write UTF-8 bytes without BOM (GetBytes never emits a preamble; avoids provider-specific WriteAllText behavior).
var normalizedContent = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(yamlContent);
await fileSystem.File.WriteAllBytesAsync(filePath, Encoding.UTF8.GetBytes(normalizedContent), ctx);
logger.LogInformation("Created changelog fragment: {FilePath}", filePath);

return true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
// See the LICENSE file in the project root for more information

using System.IO.Abstractions;
using System.Text;
using System.Text.Json;
using Actions.Core.Services;
using Elastic.Changelog.Configuration;
using Elastic.Changelog.Utilities;
using Elastic.Documentation.Configuration;
using Elastic.Documentation.Diagnostics;
using Elastic.Documentation.Services;
Expand Down Expand Up @@ -48,8 +50,12 @@ public async Task<bool> PrepareArtifact(IDiagnosticsCollector collector, Prepare
_logger.LogInformation("Reusing existing filename {Filename} for stable path on branch", changelogFilename);

var destYaml = _fileSystem.Path.Combine(input.OutputDir, changelogFilename);
_fileSystem.File.Copy(sourceYaml, destYaml, overwrite: true);
_logger.LogInformation("Copied changelog YAML: {Source} → {Dest}", sourceYaml, destYaml);
// Read YAML, normalize to remove any BOM, then write UTF-8 bytes without BOM (avoids provider-specific WriteAllText preamble behavior).
var yamlContent = await _fileSystem.File.ReadAllTextAsync(sourceYaml, ctx);
var normalizedContent = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(yamlContent);
var utf8Bytes = Encoding.UTF8.GetBytes(normalizedContent);
await _fileSystem.File.WriteAllBytesAsync(destYaml, utf8Bytes, ctx);
_logger.LogInformation("Normalized and copied changelog YAML: {Source} → {Dest}", sourceYaml, destYaml);
}
else
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
using Elastic.Changelog.Bundling;
using Elastic.Changelog.Configuration;
using Elastic.Changelog.GitHub;
using Elastic.Changelog.Utilities;
using Elastic.Documentation;
using Elastic.Documentation.Configuration;
using Elastic.Documentation.Configuration.Changelog;
Expand Down Expand Up @@ -301,7 +302,9 @@ private async Task<bool> ProcessPrReference(
var slug = ChangelogTextUtilities.GenerateSlug(title);
var filename = $"{prRef.PrNumber}-{finalType.ToStringFast(true)}-{slug}.yaml";
var filePath = _fileSystem.Path.Join(outputDir, filename);
await _fileSystem.File.WriteAllTextAsync(filePath, yamlContent, Encoding.UTF8, ctx);
// Strip any leading BOM to ensure clean UTF-8 output for tooling compatibility
var normalizedContent = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(yamlContent);
await _fileSystem.File.WriteAllBytesAsync(filePath, Encoding.UTF8.GetBytes(normalizedContent), ctx);

createdFiles.Add(filename);
_logger.LogDebug("Created changelog: {FilePath}", filePath);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// Licensed to Elasticsearch B.V under one or more agreements.
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
// See the LICENSE file in the project root for more information

using System;

namespace Elastic.Changelog.Utilities;

/// <summary>
/// Utilities for normalizing UTF-8 encoding in changelog YAML files.
/// Ensures YAML output is UTF-8 without BOM for better tooling compatibility and review ergonomics.
/// </summary>
public static class ChangelogUtf8Normalization
{
/// <summary>
/// UTF-8 Byte Order Mark character (U+FEFF).
/// </summary>
public const char Utf8BomChar = '\uFEFF';

/// <summary>
/// UTF-8 Byte Order Mark as byte sequence (EF BB BF).
/// </summary>
public static readonly byte[] Utf8BomBytes = [0xEF, 0xBB, 0xBF];

/// <summary>
/// Strips the leading UTF-8 BOM character from a string if present.
/// YAML should be UTF-8 without BOM for tooling and review ergonomics.
/// </summary>
/// <param name="text">The text to normalize</param>
/// <returns>Text with leading BOM character removed if it was present</returns>
public static string StripLeadingUtf8BomChar(string text)
{
if (string.IsNullOrEmpty(text))
return text;

return text.StartsWith(Utf8BomChar) ? text.Substring(1) : text;
}

/// <summary>
/// Checks if a byte span starts with the UTF-8 BOM sequence (EF BB BF).
/// </summary>
/// <param name="bytes">The byte span to check</param>
/// <returns>True if the span starts with UTF-8 BOM bytes</returns>
public static bool HasUtf8Bom(ReadOnlySpan<byte> bytes) => bytes.Length >= 3 &&
bytes[0] == 0xEF &&
bytes[1] == 0xBB &&
bytes[2] == 0xBF;
}
7 changes: 6 additions & 1 deletion src/tooling/docs-builder/Commands/ChangelogCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
using Elastic.Changelog.GithubRelease;
using Elastic.Changelog.Rendering;
using Elastic.Changelog.Uploading;
using Elastic.Changelog.Utilities;
using Elastic.Documentation.Configuration;
using Elastic.Documentation.Diagnostics;
using Elastic.Documentation.ReleaseNotes;
Expand Down Expand Up @@ -162,6 +163,8 @@ public Task<int> Init(
try
{
var content = _fileSystem.File.ReadAllText(configPath);
// Strip any leading BOM that might be present after reading
content = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(content);

if (useNonDefaultChangelogDir)
{
Expand All @@ -175,7 +178,9 @@ public Task<int> Init(
content = BundleOutputDirectoryRegex().Replace(content, "$1" + outputValue);
}

_fileSystem.File.WriteAllText(configPath, content);
// Ensure normalized content is written without BOM
var normalizedContent = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(content);
_fileSystem.File.WriteAllText(configPath, normalizedContent);
_logger.LogInformation("Updated bundle paths in changelog configuration: {ConfigPath}", configPath);
}
catch (IOException ex)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using System.Text;
using AwesomeAssertions;
using Elastic.Changelog.Bundling;
using Elastic.Changelog.Utilities;
using Elastic.Documentation.Configuration;
using Elastic.Documentation.Diagnostics;
using Microsoft.Extensions.Logging.Abstractions;
Expand Down Expand Up @@ -6124,6 +6125,59 @@ await FileSystem.File.WriteAllTextAsync(configPath,
bundleContent.Should().Contain("release-date:", "release date should be auto-populated when bundle.release_dates is true");
}

[Fact]
public async Task BundleChangelogs_WithBomPrefixedInput_ProducesNormalizedOutput()
{
// Arrange - Create changelog with BOM prefix
// language=yaml
var changelogContent =
"""
title: Test changelog with BOM
type: feature
products:
- product: elasticsearch
target: 9.2.0
lifecycle: ga
prs:
- https://github.com/elastic/elasticsearch/pull/123
""";

// Add UTF-8 BOM to the content
var contentWithBom = ChangelogUtf8Normalization.Utf8BomChar + changelogContent;
var changelogFile = FileSystem.Path.Join(_changelogDir, "changelog-with-bom.yaml");

// Write the file with BOM using explicit encoding
await FileSystem.File.WriteAllTextAsync(changelogFile, contentWithBom, Encoding.UTF8, TestContext.Current.CancellationToken);

// Verify the source file has BOM by reading as bytes
var sourceBytes = await FileSystem.File.ReadAllBytesAsync(changelogFile, TestContext.Current.CancellationToken);
ChangelogUtf8Normalization.HasUtf8Bom(sourceBytes).Should().BeTrue("source file should contain BOM");

var outputPath = FileSystem.Path.Join(Paths.WorkingDirectoryRoot.FullName, Guid.NewGuid().ToString(), "bundle.yaml");
var input = new BundleChangelogsArguments
{
Directory = _changelogDir,
All = true,
Output = outputPath
};

// Act
var result = await Service.BundleChangelogs(Collector, input, TestContext.Current.CancellationToken);

// Assert
result.Should().BeTrue("bundling should succeed");
Collector.Errors.Should().Be(0);

// Verify output file does not contain BOM
var outputBytes = await FileSystem.File.ReadAllBytesAsync(outputPath, TestContext.Current.CancellationToken);
ChangelogUtf8Normalization.HasUtf8Bom(outputBytes).Should().BeFalse("bundled output should not contain UTF-8 BOM");

// Verify content refs (bundle uses file refs + checksum unless resolve inlines entries)
var bundleContent = await FileSystem.File.ReadAllTextAsync(outputPath, TestContext.Current.CancellationToken);
bundleContent.Should().Contain("changelog-with-bom.yaml");
bundleContent.Should().Contain("entries:");
}

private void CreateSampleChangelogs()
{
// language=yaml
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
// See the LICENSE file in the project root for more information

using System.IO.Abstractions.TestingHelpers;
using System.Text;
using AwesomeAssertions;
using Elastic.Changelog.Creation;
using Elastic.Changelog.GitHub;
using Elastic.Changelog.Tests.Changelogs;
using Elastic.Changelog.Utilities;
using Elastic.Documentation.Configuration;
using FakeItEasy;

Expand Down Expand Up @@ -230,4 +232,43 @@ public async Task CreateChangelog_TempOutputDirectory_Succeeds()
writeFs.Directory.Exists(tempOutput).Should().BeTrue();
writeFs.Directory.GetFiles(tempOutput, "*.yaml").Should().NotBeEmpty();
}

[Fact]
public async Task CreateChangelog_OutputDoesNotContainBom()
{
await WriteConfig(ConfigWithProductLabels);
var tempOutput = Path.Join(Paths.WorkingDirectoryRoot.FullName, Guid.NewGuid().ToString());
FileSystem.Directory.CreateDirectory(tempOutput);

var service = new ChangelogCreationService(LoggerFactory, ConfigurationContext, _mockGitHub, FileSystem, null);
var input = new CreateChangelogArguments
{
Title = "Test BOM handling",
Type = "feature",
Products = [new ProductArgument { Product = "elasticsearch", Target = "9.1.0", Lifecycle = "ga" }],
Config = Path.Join(Paths.WorkingDirectoryRoot.FullName, "config", "changelog.yml"),
Output = tempOutput,
Concise = true
};

// Act
var result = await service.CreateChangelog(Collector, input, TestContext.Current.CancellationToken);

// Assert
result.Should().BeTrue("changelog creation should succeed");
Collector.Errors.Should().Be(0);

// Verify created file does not contain BOM
var yamlFiles = FileSystem.Directory.GetFiles(tempOutput, "*.yaml");
yamlFiles.Should().NotBeEmpty("should create a YAML file");

var yamlFile = yamlFiles[0];
var bytes = await FileSystem.File.ReadAllBytesAsync(yamlFile, TestContext.Current.CancellationToken);
ChangelogUtf8Normalization.HasUtf8Bom(bytes).Should().BeFalse("created changelog should not contain UTF-8 BOM");

// Verify content is correct
var content = await FileSystem.File.ReadAllTextAsync(yamlFile, TestContext.Current.CancellationToken);
content.Should().Contain("Test BOM handling");
content.Should().Contain("type: feature");
}
}
Loading
Loading