Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
using System.Text.RegularExpressions;
using Elastic.Documentation.Configuration.Serialization;
using Elastic.Documentation.ReleaseNotes;
using Elastic.Documentation.Text;
using YamlDotNet.Serialization;
using YamlDotNet.Serialization.NamingConventions;

Expand Down Expand Up @@ -50,6 +51,7 @@ public static partial class ReleaseNotesSerialization
/// </summary>
public static ChangelogEntry DeserializeEntry(string yaml)
{
	// Drop any leading BOM characters (U+FEFF) before the text reaches the YAML deserializer.
	var bomFreeYaml = Utf8TextNormalization.StripLeadingUtf8Bom(yaml)!;

	// Deserialize into the DTO shape, then map the DTO onto the domain entry.
	return ToEntry(YamlDeserializer.Deserialize<ChangelogEntryDto>(bomFreeYaml));
}
Expand All @@ -71,6 +73,7 @@ public static ChangelogEntry DeserializeEntry(string yaml)
/// </summary>
public static Bundle DeserializeBundle(string yaml)
{
	// Drop any leading BOM characters (U+FEFF) before the text reaches the YAML deserializer.
	var bomFreeYaml = Utf8TextNormalization.StripLeadingUtf8Bom(yaml)!;

	// Deserialize into the DTO shape, then map the DTO onto the domain bundle.
	return ToBundle(YamlDeserializer.Deserialize<BundleDto>(bomFreeYaml));
}
Expand Down Expand Up @@ -364,6 +367,7 @@ private static ChangelogEntryType ParseEntryType(string? value)
/// <returns>The normalized YAML content.</returns>
public static string NormalizeYaml(string yaml)
{
yaml = Utf8TextNormalization.StripLeadingUtf8Bom(yaml)!;
// Skip comment lines
var yamlLines = yaml.Split('\n');
var yamlWithoutComments = string.Join('\n', yamlLines.Where(line => !line.TrimStart().StartsWith('#')));
Expand Down
57 changes: 57 additions & 0 deletions src/Elastic.Documentation/Text/Utf8TextNormalization.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Licensed to Elasticsearch B.V under one or more agreements.
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
// See the LICENSE file in the project root for more information

namespace Elastic.Documentation.Text;

/// <summary>
/// UTF-8 text normalization utilities for detecting and removing Byte Order Marks (BOMs).
/// </summary>
public static class Utf8TextNormalization
{
	/// <summary>
	/// UTF-8 Byte Order Mark character (U+FEFF Zero Width No-Break Space).
	/// </summary>
	public const char Utf8BomChar = '\uFEFF';

	/// <summary>
	/// UTF-8 Byte Order Mark byte sequence (EF BB BF).
	/// </summary>
	/// <remarks>
	/// The array elements stay mutable even though the field is <c>readonly</c>; treat the contents as
	/// read-only. <see cref="HasUtf8Bom"/> matches literal byte values rather than reading this array,
	/// so its result cannot be altered by a caller mutating the array.
	/// </remarks>
	public static readonly byte[] Utf8BomBytes = [0xEF, 0xBB, 0xBF];

	/// <summary>
	/// Strips all consecutive leading UTF-8 BOM characters (U+FEFF) from the beginning of a string.
	/// <para>
	/// Only U+FEFF is removed. Other zero-width characters such as U+200B (Zero Width Space) or
	/// U+2060 (Word Joiner) are left in place, and U+FEFF characters that are not at the start of
	/// the string are preserved.
	/// </para>
	/// </summary>
	/// <param name="text">The input string, which may be null or empty.</param>
	/// <returns>
	/// <c>null</c> when <paramref name="text"/> is <c>null</c>; otherwise the text without its leading
	/// BOM characters. When there is nothing to strip the original string instance is returned.
	/// </returns>
	// The return annotation tells the compiler that a non-null argument yields a non-null result,
	// so callers passing a non-nullable string do not need the null-forgiving operator.
	[return: System.Diagnostics.CodeAnalysis.NotNullIfNotNull(nameof(text))]
	public static string? StripLeadingUtf8Bom(string? text) =>
		// TrimStart(char) removes every consecutive leading occurrence and hands back the same
		// instance when no character was removed, matching the previous hand-written loop.
		text?.TrimStart(Utf8BomChar);

	/// <summary>
	/// Checks if the given byte span starts with the UTF-8 Byte Order Mark sequence (EF BB BF).
	/// </summary>
	/// <param name="bytes">The byte span to check; spans shorter than three bytes never match.</param>
	/// <returns>True if the span starts with the UTF-8 BOM sequence, false otherwise.</returns>
	public static bool HasUtf8Bom(ReadOnlySpan<byte> bytes) =>
		// List pattern: at least three bytes, the first three being EF BB BF, anything after.
		bytes is [0xEF, 0xBB, 0xBF, ..];
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
using System.Text;
using System.Text.RegularExpressions;
using Elastic.Changelog.Configuration;
using Elastic.Changelog.Utilities;
using Elastic.Documentation.Configuration;
using Elastic.Documentation.Configuration.Assembler;
using Elastic.Documentation.Configuration.Changelog;
Expand Down Expand Up @@ -50,6 +51,11 @@ public partial class ChangelogBundleAmendService(
ScopedFileSystem? fileSystem = null,
IConfigurationContext? configurationContext = null) : IService
{
/// <summary>
/// UTF-8 encoding without BOM for writing YAML files.
/// </summary>
private static readonly UTF8Encoding Utf8NoBom = new(encoderShouldEmitUTF8Identifier: false);

private readonly ILogger _logger = logFactory.CreateLogger<ChangelogBundleAmendService>();
private readonly IFileSystem _fileSystem = fileSystem ?? FileSystemFactory.RealRead;
private readonly ChangelogConfigurationLoader? _configLoader = configurationContext != null
Expand Down Expand Up @@ -256,7 +262,9 @@ public async Task<bool> AmendBundle(IDiagnosticsCollector collector, AmendBundle
if (!string.IsNullOrWhiteSpace(outputDir) && !_fileSystem.Directory.Exists(outputDir))
_ = _fileSystem.Directory.CreateDirectory(outputDir);

await _fileSystem.File.WriteAllTextAsync(amendFilePath, yaml, Encoding.UTF8, ctx);
// Strip any leading BOM to ensure clean UTF-8 output for tooling compatibility
var normalizedYaml = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(yaml);
await _fileSystem.File.WriteAllTextAsync(amendFilePath, normalizedYaml, Utf8NoBom, ctx);
_logger.LogInformation("Created amend file: {AmendFilePath} with {Count} entries", amendFilePath, entries.Count);

return true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
using Elastic.Changelog.Configuration;
using Elastic.Changelog.GitHub;
using Elastic.Changelog.Rendering;
using Elastic.Changelog.Utilities;
using Elastic.Documentation;
using Elastic.Documentation.Configuration;
using Elastic.Documentation.Configuration.Assembler;
Expand Down Expand Up @@ -137,6 +138,11 @@ public partial class ChangelogBundlingService(
? new ChangelogConfigurationLoader(logFactory, configurationContext, fileSystem ?? FileSystemFactory.RealRead)
: null;

/// <summary>
/// UTF-8 encoding without BOM for writing YAML files.
/// </summary>
private static readonly UTF8Encoding Utf8NoBom = new(encoderShouldEmitUTF8Identifier: false);

[GeneratedRegex(@"(\s+)version:", RegexOptions.Multiline)]
internal static partial Regex VersionToTargetRegex();

Expand Down Expand Up @@ -766,7 +772,9 @@ private async Task WriteBundleFileAsync(Bundle bundledData, string outputPath, C
}

// Write bundled file with explicit UTF-8 encoding to ensure proper character handling
await _fileSystem.File.WriteAllTextAsync(outputPath, bundledYaml, Encoding.UTF8, ctx);
// Strip any leading BOM to ensure clean UTF-8 output for tooling compatibility
var normalizedYaml = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(bundledYaml);
await _fileSystem.File.WriteAllTextAsync(outputPath, normalizedYaml, Utf8NoBom, ctx);
_logger.LogInformation("Created bundled changelog: {OutputPath}", outputPath);
}

Expand Down
10 changes: 8 additions & 2 deletions src/services/Elastic.Changelog/Creation/ChangelogFileWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

using System.IO.Abstractions;
using System.Text;
using Elastic.Changelog.Utilities;
using Elastic.Documentation;
using Elastic.Documentation.Configuration.Changelog;
using Elastic.Documentation.Configuration.ReleaseNotes;
Expand All @@ -18,6 +19,10 @@ namespace Elastic.Changelog.Creation;
/// </summary>
public class ChangelogFileWriter(IFileSystem fileSystem, ILogger logger)
{
/// <summary>
/// UTF-8 encoding without BOM for writing YAML files.
/// </summary>
private static readonly UTF8Encoding Utf8NoBom = new(encoderShouldEmitUTF8Identifier: false);
/// <summary>
/// Writes a changelog file with the given data.
/// </summary>
Expand Down Expand Up @@ -46,8 +51,9 @@ public async Task<bool> WriteChangelogAsync(
var filename = GenerateFilename(collector, input);
var filePath = fileSystem.Path.Join(outputDir, filename);

// Write file with explicit UTF-8 encoding to ensure proper character handling
await fileSystem.File.WriteAllTextAsync(filePath, yamlContent, Encoding.UTF8, ctx);
// Write UTF-8 text without BOM using explicit encoding instance.
var normalizedContent = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(yamlContent);
await fileSystem.File.WriteAllTextAsync(filePath, normalizedContent, Utf8NoBom, ctx);
logger.LogInformation("Created changelog fragment: {FilePath}", filePath);

return true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
// See the LICENSE file in the project root for more information

using System.IO.Abstractions;
using System.Text;
using System.Text.Json;
using Actions.Core.Services;
using Elastic.Changelog.Configuration;
using Elastic.Changelog.Utilities;
using Elastic.Documentation.Configuration;
using Elastic.Documentation.Diagnostics;
using Elastic.Documentation.Services;
Expand All @@ -21,6 +23,11 @@ public class ChangelogPrepareArtifactService(
IFileSystem? fileSystem = null
) : IService
{
/// <summary>
/// UTF-8 encoding without BOM for writing YAML files.
/// </summary>
private static readonly UTF8Encoding Utf8NoBom = new(encoderShouldEmitUTF8Identifier: false);

private readonly ILogger _logger = logFactory.CreateLogger<ChangelogPrepareArtifactService>();
private readonly IFileSystem _fileSystem = fileSystem ?? new FileSystem();
private readonly ChangelogConfigurationLoader _configLoader = new(logFactory, configurationContext, fileSystem ?? new FileSystem());
Expand Down Expand Up @@ -48,8 +55,11 @@ public async Task<bool> PrepareArtifact(IDiagnosticsCollector collector, Prepare
_logger.LogInformation("Reusing existing filename {Filename} for stable path on branch", changelogFilename);

var destYaml = _fileSystem.Path.Combine(input.OutputDir, changelogFilename);
_fileSystem.File.Copy(sourceYaml, destYaml, overwrite: true);
_logger.LogInformation("Copied changelog YAML: {Source} → {Dest}", sourceYaml, destYaml);
// Read the YAML, strip any leading BOM character, then rewrite it with an explicit BOM-less UTF-8 encoding so no preamble is emitted.
var yamlContent = await _fileSystem.File.ReadAllTextAsync(sourceYaml, ctx);
var normalizedContent = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(yamlContent);
await _fileSystem.File.WriteAllTextAsync(destYaml, normalizedContent, Utf8NoBom, ctx);
_logger.LogInformation("Normalized and copied changelog YAML: {Source} → {Dest}", sourceYaml, destYaml);
}
else
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
using Elastic.Changelog.Bundling;
using Elastic.Changelog.Configuration;
using Elastic.Changelog.GitHub;
using Elastic.Changelog.Utilities;
using Elastic.Documentation;
using Elastic.Documentation.Configuration;
using Elastic.Documentation.Configuration.Changelog;
Expand Down Expand Up @@ -86,6 +87,11 @@ public class GitHubReleaseChangelogService(
ChangelogBundlingService? bundlingService = null
) : IService
{
/// <summary>
/// UTF-8 encoding without BOM for writing YAML files.
/// </summary>
private static readonly UTF8Encoding Utf8NoBom = new(encoderShouldEmitUTF8Identifier: false);

private readonly ILogger _logger = logFactory.CreateLogger<GitHubReleaseChangelogService>();
private readonly IFileSystem _fileSystem = fileSystem ?? FileSystemFactory.RealRead;
private readonly ChangelogConfigurationLoader _configLoader = new(logFactory, configurationContext, fileSystem ?? FileSystemFactory.RealRead);
Expand Down Expand Up @@ -301,7 +307,9 @@ private async Task<bool> ProcessPrReference(
var slug = ChangelogTextUtilities.GenerateSlug(title);
var filename = $"{prRef.PrNumber}-{finalType.ToStringFast(true)}-{slug}.yaml";
var filePath = _fileSystem.Path.Join(outputDir, filename);
await _fileSystem.File.WriteAllTextAsync(filePath, yamlContent, Encoding.UTF8, ctx);
// Strip any leading BOM to ensure clean UTF-8 output for tooling compatibility
var normalizedContent = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(yamlContent);
await _fileSystem.File.WriteAllTextAsync(filePath, normalizedContent, Utf8NoBom, ctx);

createdFiles.Add(filename);
_logger.LogDebug("Created changelog: {FilePath}", filePath);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
// Licensed to Elasticsearch B.V under one or more agreements.
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
// See the LICENSE file in the project root for more information

using System;
using Elastic.Documentation.Text;

namespace Elastic.Changelog.Utilities;

/// <summary>
/// Changelog-facing entry points for UTF-8 BOM handling in YAML files.
/// Every member forwards to <see cref="Utf8TextNormalization"/>; no logic lives here.
/// Changelog YAML is expected to be UTF-8 without a BOM for tooling compatibility and review ergonomics.
/// </summary>
public static class ChangelogUtf8Normalization
{
	/// <summary>
	/// The UTF-8 Byte Order Mark character (U+FEFF), re-exported from <see cref="Utf8TextNormalization.Utf8BomChar"/>.
	/// </summary>
	public const char Utf8BomChar = Utf8TextNormalization.Utf8BomChar;

	/// <summary>
	/// The UTF-8 Byte Order Mark byte sequence (EF BB BF).
	/// This field references the same array instance as <see cref="Utf8TextNormalization.Utf8BomBytes"/>.
	/// </summary>
	public static readonly byte[] Utf8BomBytes = Utf8TextNormalization.Utf8BomBytes;

	/// <summary>
	/// Removes leading UTF-8 BOM characters (U+FEFF) from <paramref name="text"/> by delegating to
	/// <see cref="Utf8TextNormalization.StripLeadingUtf8Bom"/>.
	/// </summary>
	/// <param name="text">The text to normalize</param>
	/// <returns>The text without its leading BOM characters</returns>
	public static string StripLeadingUtf8BomChar(string text)
	{
		var stripped = Utf8TextNormalization.StripLeadingUtf8Bom(text);
		return stripped!;
	}

	/// <summary>
	/// Reports whether <paramref name="bytes"/> begins with the UTF-8 BOM sequence (EF BB BF) by delegating to
	/// <see cref="Utf8TextNormalization.HasUtf8Bom"/>.
	/// </summary>
	/// <param name="bytes">The byte span to check</param>
	/// <returns>True if the span starts with UTF-8 BOM bytes</returns>
	public static bool HasUtf8Bom(ReadOnlySpan<byte> bytes)
	{
		return Utf8TextNormalization.HasUtf8Bom(bytes);
	}
}
7 changes: 6 additions & 1 deletion src/tooling/docs-builder/Commands/ChangelogCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
using Elastic.Changelog.GithubRelease;
using Elastic.Changelog.Rendering;
using Elastic.Changelog.Uploading;
using Elastic.Changelog.Utilities;
using Elastic.Documentation.Configuration;
using Elastic.Documentation.Diagnostics;
using Elastic.Documentation.ReleaseNotes;
Expand Down Expand Up @@ -162,6 +163,8 @@ public Task<int> Init(
try
{
var content = _fileSystem.File.ReadAllText(configPath);
// Strip any leading BOM that might be present after reading
content = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(content);

if (useNonDefaultChangelogDir)
{
Expand All @@ -175,7 +178,9 @@ public Task<int> Init(
content = BundleOutputDirectoryRegex().Replace(content, "$1" + outputValue);
}

_fileSystem.File.WriteAllText(configPath, content);
// Ensure normalized content is written without BOM
var normalizedContent = ChangelogUtf8Normalization.StripLeadingUtf8BomChar(content);
_fileSystem.File.WriteAllText(configPath, normalizedContent);
_logger.LogInformation("Updated bundle paths in changelog configuration: {ConfigPath}", configPath);
}
catch (IOException ex)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using System.Text;
using AwesomeAssertions;
using Elastic.Changelog.Bundling;
using Elastic.Changelog.Utilities;
using Elastic.Documentation.Configuration;
using Elastic.Documentation.Diagnostics;
using Microsoft.Extensions.Logging.Abstractions;
Expand Down Expand Up @@ -6124,6 +6125,59 @@ await FileSystem.File.WriteAllTextAsync(configPath,
bundleContent.Should().Contain("release-date:", "release date should be auto-populated when bundle.release_dates is true");
}

// Verifies that bundling a changelog file that starts with a UTF-8 BOM succeeds without errors
// and that the bundle file written to the requested output path does not start with EF BB BF.
[Fact]
public async Task BundleChangelogs_WithBomPrefixedInput_ProducesNormalizedOutput()
{
// Arrange - Create changelog with BOM prefix
// language=yaml
var changelogContent =
"""
title: Test changelog with BOM
type: feature
products:
- product: elasticsearch
target: 9.2.0
lifecycle: ga
prs:
- https://github.com/elastic/elasticsearch/pull/123
""";

// Prepend the BOM character (U+FEFF) to the YAML text itself
var contentWithBom = ChangelogUtf8Normalization.Utf8BomChar + changelogContent;
var changelogFile = FileSystem.Path.Join(_changelogDir, "changelog-with-bom.yaml");

// Write the file with BOM using explicit encoding
// NOTE(review): Encoding.UTF8 normally emits its own BOM preamble, so combined with the explicit
// U+FEFF above the file may begin with two BOM sequences - confirm this is the intended fixture.
await FileSystem.File.WriteAllTextAsync(changelogFile, contentWithBom, Encoding.UTF8, TestContext.Current.CancellationToken);

// Verify the source file has BOM by reading as bytes
// (guards against a fixture that silently contains no BOM, which would make the test vacuous)
var sourceBytes = await FileSystem.File.ReadAllBytesAsync(changelogFile, TestContext.Current.CancellationToken);
ChangelogUtf8Normalization.HasUtf8Bom(sourceBytes).Should().BeTrue("source file should contain BOM");

// Bundle every changelog in the directory into a file under a fresh, GUID-named output folder
var outputPath = FileSystem.Path.Join(Paths.WorkingDirectoryRoot.FullName, Guid.NewGuid().ToString(), "bundle.yaml");
var input = new BundleChangelogsArguments
{
Directory = _changelogDir,
All = true,
Output = outputPath
};

// Act
var result = await Service.BundleChangelogs(Collector, input, TestContext.Current.CancellationToken);

// Assert
result.Should().BeTrue("bundling should succeed");
Collector.Errors.Should().Be(0);

// Verify output file does not contain BOM
// (checked on raw bytes, because reading the file as text could hide a BOM during decoding)
var outputBytes = await FileSystem.File.ReadAllBytesAsync(outputPath, TestContext.Current.CancellationToken);
ChangelogUtf8Normalization.HasUtf8Bom(outputBytes).Should().BeFalse("bundled output should not contain UTF-8 BOM");

// Verify content refs (bundle uses file refs + checksum unless resolve inlines entries)
var bundleContent = await FileSystem.File.ReadAllTextAsync(outputPath, TestContext.Current.CancellationToken);
bundleContent.Should().Contain("changelog-with-bom.yaml");
bundleContent.Should().Contain("entries:");
}

private void CreateSampleChangelogs()
{
// language=yaml
Expand Down
Loading
Loading