From c0fb50d8ff7795ffcb89de531304fc59a97e91a0 Mon Sep 17 00:00:00 2001 From: Cas Donoghue Date: Wed, 10 Dec 2025 10:24:21 -0800 Subject: [PATCH] Remove duplicate gems when producting logstash artifacts (#18340) * Remove duplicate gems when producting logstash artifacts Bundler is used to manage a gem environment that is shipped with logstash artifacts. By default, bundler will install newer/duplicate gems than shipped with ruby distributions (in logstash's case jruby). Duplicate gems in the shipped environment can cause issues with code loading with ambiguous gem specs or gem activation issues. This commit adds a step to compute the duplicate gems managed with bundler (and therefore direct/transitive dependencies of logstash/plugins) and *removes* copies shipped with jruby. Note that there are two locations to do the deduplication at. Both the stdlib gems as well as what jruby refers to as "bundled" gems. The existing pattern for excluding files from artifacts is used to implement the deduplication. * only remove gemspecs for duplicated stdlib gems * Make deduplicate a separate rake task and prevent gradle errors Deduplication should happen as a depenedency of installing default gems. In the current workflow we have a top level gradle task for packaging which calls out to rake. Rake then invokes a *separate* gradle process. When we modify the jruby default, when the separate gradle process goes to check of jruby is installed, it sees a modified jruby and tries to re-install. We work around this by changing how gradle detects if jruby is required to be installed. * Ensure the set of gems tested at unit level matches packages This commit adds the installDefaultGems task to the unit test tasks. This ensures that the gem env tested at the unit level matches the deduplicated one at the integration/acceptance level. Takes over https://github.com/elastic/logstash/pull/18330 * WIP: Use logstash_gem_home for gemInstaller This commit changes gemInstaller such that the centralized gem_home from Logstash::Environment is used instead of hard coding in a fragile path. The tests were the only consumer of the optional positional parameter in the `install` class method. * Fix gem env setup for ruby unit tests After some deeeeeeeep diving into comparing the state of running logstash from a compiled artifact vs the unit tests i finally figured out that the use of the bundler `setup!` method in unit tests is imcompatible with a couple of tests. Specifically that method puts bundler installed gems ahead of the standard lib gems in the load path. This commit solves that by re-positioning the standarl lib back to the front of the load path. * Show how using `--prefer-local` causes issues Ideally bundler will consider default/stdlib gems when doing dependency resolution to avoid duplication in the first place. this seems to break the pluginmanager. Verify this happens in CI... * Revert "Show how using `--prefer-local` causes issues" This reverts commit 5a3b2bb2ac6f79170a137f5668e189ec22016b81. * fix rebase error (cherry picked from commit e08abb8c006c4f0eae5c3a8b11de9f886ee84d5b) --- build.gradle | 2 + lib/bootstrap/rspec.rb | 10 ++++ lib/pluginmanager/gem_installer.rb | 10 ++-- rakelib/artifacts.rake | 12 ----- rakelib/plugin.rake | 53 +++++++++++++++++++ rubyUtils.gradle | 12 ++++- .../unit/plugin_manager/gem_installer_spec.rb | 25 ++++++--- 7 files changed, 97 insertions(+), 27 deletions(-) diff --git a/build.gradle b/build.gradle index 890441ca650..ebde43d5e7a 100644 --- a/build.gradle +++ b/build.gradle @@ -420,6 +420,7 @@ project(":logstash-core") { tasks.getByPath(":logstash-core:" + tsk).configure { dependsOn copyPluginTestAlias dependsOn installDevelopmentGems + dependsOn installDefaultGems } } } @@ -984,6 +985,7 @@ if (System.getenv('OSS') != 'true') { ["rubyTests", "rubyIntegrationTests", "test"].each { tsk -> tasks.getByPath(":logstash-xpack:" + tsk).configure { dependsOn installDevelopmentGems + dependsOn installDefaultGems } } } diff --git a/lib/bootstrap/rspec.rb b/lib/bootstrap/rspec.rb index 782455135ee..db6cdeb7571 100755 --- a/lib/bootstrap/rspec.rb +++ b/lib/bootstrap/rspec.rb @@ -17,6 +17,16 @@ require_relative "environment" LogStash::Bundler.setup!({:without => [:build]}) +# Our use of LogStash::Bundler.setup! here leaves us in kind of a wonky state for *all* tests +# Essentially we end up with a load path that favors bundlers gem env over stdlib. This is +# not really the call stack in logstash itself, so while this does make the full bundled gem +# env available for tests, it also has a quirk where stdlib gems are not loaed correctly. The +# following patch ensures that stdlib gems are bumped to the front of the load path for unit +# tests. +## START PATCH ## +jruby_stdlib = $LOAD_PATH.find { |p| p.end_with?('vendor/jruby/lib/ruby/stdlib') } +$LOAD_PATH.unshift($LOAD_PATH.delete(jruby_stdlib)) if jruby_stdlib +## END PATCH ## require "logstash-core" require "logstash/environment" diff --git a/lib/pluginmanager/gem_installer.rb b/lib/pluginmanager/gem_installer.rb index e5560a78025..17eb889ef8e 100644 --- a/lib/pluginmanager/gem_installer.rb +++ b/lib/pluginmanager/gem_installer.rb @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. +require "bootstrap/environment" require "pluginmanager/ui" require "pathname" require "rubygems/package" @@ -26,17 +27,16 @@ module LogStash module PluginManager # - Generate the specifications # - Copy the data in the right folders class GemInstaller - GEM_HOME = Pathname.new(::File.join(LogStash::Environment::BUNDLE_DIR, "jruby", "3.1.0")) SPECIFICATIONS_DIR = "specifications" GEMS_DIR = "gems" CACHE_DIR = "cache" attr_reader :gem_home - def initialize(gem_file, display_post_install_message = false, gem_home = GEM_HOME) + def initialize(gem_file, display_post_install_message = false) @gem_file = gem_file @gem = ::Gem::Package.new(@gem_file) - @gem_home = Pathname.new(gem_home) + @gem_home = Pathname.new(LogStash::Environment.logstash_gem_home) @display_post_install_message = display_post_install_message end @@ -48,8 +48,8 @@ def install post_install_message end - def self.install(gem_file, display_post_install_message = false, gem_home = GEM_HOME) - self.new(gem_file, display_post_install_message, gem_home).install + def self.install(gem_file, display_post_install_message = false) + self.new(gem_file, display_post_install_message).install end private diff --git a/rakelib/artifacts.rake b/rakelib/artifacts.rake index 1c52f03bf84..94e33005f5b 100644 --- a/rakelib/artifacts.rake +++ b/rakelib/artifacts.rake @@ -103,18 +103,6 @@ namespace "artifact" do @exclude_paths << 'vendor/**/gems/**/Gemfile.lock' @exclude_paths << 'vendor/**/gems/**/Gemfile' - @exclude_paths << 'vendor/jruby/lib/ruby/gems/shared/gems/rake-*' - # exclude ruby-maven-libs 3.3.9 jars until JRuby ships with >= 3.8.9 - @exclude_paths << 'vendor/bundle/jruby/**/gems/ruby-maven-libs-3.3.9/**/*' - - # remove this after JRuby includes rexml 3.3.x - @exclude_paths << 'vendor/jruby/lib/ruby/gems/shared/gems/rexml-3.2.5/**/*' - @exclude_paths << 'vendor/jruby/lib/ruby/gems/shared/specifications/rexml-3.2.5.gemspec' - - # remove this after JRuby includes net-imap-0.2.4+ - @exclude_paths << 'vendor/jruby/lib/ruby/gems/shared/specifications/net-imap-0.2.3.gemspec' - @exclude_paths << 'vendor/jruby/lib/ruby/gems/shared/gems/net-imap-0.2.3/**/*' - # Exclude env2yaml source files - only compiled classes should be in tarball @exclude_paths << 'docker/data/logstash/env2yaml/**/*.java' @exclude_paths << 'docker/data/logstash/env2yaml/build.gradle' diff --git a/rakelib/plugin.rake b/rakelib/plugin.rake index 47d572f417a..3742b59b8d5 100644 --- a/rakelib/plugin.rake +++ b/rakelib/plugin.rake @@ -90,12 +90,65 @@ namespace "plugin" do task.reenable # Allow this task to be run again end # task "install" + + task "clean-duplicate-gems" do + shared_gems_path = File.join(LogStash::Environment::LOGSTASH_HOME, + 'vendor/jruby/lib/ruby/gems/shared/gems') + default_gemspecs_path = File.join(LogStash::Environment::LOGSTASH_HOME, + 'vendor/jruby/lib/ruby/gems/shared/specifications/default') + bundle_gems_path = File.join(LogStash::Environment::BUNDLE_DIR, + 'jruby/*/gems') + + # "bundled" gems in jruby + # https://github.com/jruby/jruby/blob/024123c29d73b672d50730117494f3e4336a0edb/lib/pom.rb#L108-L152 + shared_gem_names = Dir.glob(File.join(shared_gems_path, '*')).map do |path| + match = File.basename(path).match(/^(.+?)-\d+/) + match ? match[1] : nil + end.compact + + # "default" gems in jruby/ruby + # https://github.com/jruby/jruby/blob/024123c29d73b672d50730117494f3e4336a0edb/lib/pom.rb#L21-L106 + default_gem_names = Dir.glob(File.join(default_gemspecs_path, '*.gemspec')).map do |path| + match = File.basename(path).match(/^(.+?)-\d+/) + match ? match[1] : nil + end.compact + + # gems we explicitly manage with bundler (we always want these to take precedence) + bundle_gem_names = Dir.glob(File.join(bundle_gems_path, '*')).map do |path| + match = File.basename(path).match(/^(.+?)-\d+/) + match ? match[1] : nil + end.compact + + shared_duplicates = shared_gem_names & bundle_gem_names + default_duplicates = default_gem_names & bundle_gem_names + all_duplicates = (shared_duplicates + default_duplicates).uniq + + puts("[plugin:clean-duplicate-gems] Removing duplicate gems: #{all_duplicates.sort.join(', ')}") + + # Remove shared/bundled gem duplicates + shared_duplicates.each do |gem_name| + FileUtils.rm_rf(Dir.glob("#{shared_gems_path}/#{gem_name}-*")) + FileUtils.rm_rf(Dir.glob("#{shared_gems_path}/../specifications/#{gem_name}-*.gemspec")) + end + + # Remove default gem gemspecs only + default_duplicates.each do |gem_name| + # For stdlib default gems we only remove the gemspecs as removing the source code + # files results in code loading errors and ruby warnings + FileUtils.rm_rf(Dir.glob("#{default_gemspecs_path}/#{gem_name}-*.gemspec")) + end + + task.reenable + end + task "install-default" => "bootstrap" do puts("[plugin:install-default] Installing default plugins") remove_lockfile # because we want to use the release lockfile install_plugins("--no-verify", "--preserve", *LogStash::RakeLib::DEFAULT_PLUGINS) + # Clean duplicates after full gem resolution + Rake::Task["plugin:clean-duplicate-gems"].invoke task.reenable # Allow this task to be run again end diff --git a/rubyUtils.gradle b/rubyUtils.gradle index 22818f0f2dd..f3d332b363b 100644 --- a/rubyUtils.gradle +++ b/rubyUtils.gradle @@ -279,7 +279,11 @@ tasks.register("installCustomJRuby", Copy) { dependsOn buildCustomJRuby description "Install custom built JRuby in the vendor directory" inputs.file(customJRubyTar) - outputs.dir("${projectDir}/vendor/jruby") + // Don't re-extract if core JRuby is already installed. This works around + // gem deduplication when rake calls back in to gradle. + onlyIf { + !file("${projectDir}/vendor/jruby/bin/jruby").exists() + } from tarTree(customJRubyTar == "" ? jrubyTarPath : customJRubyTar) eachFile { f -> f.path = f.path.replaceFirst("^jruby-${customJRubyVersion}", '') @@ -294,7 +298,11 @@ tasks.register("downloadAndInstallJRuby", Copy) { dependsOn=[verifyFile, installCustomJRuby] description "Install JRuby in the vendor directory" inputs.file(jrubyTarPath) - outputs.dir("${projectDir}/vendor/jruby") + // Don't re-extract if core JRuby is already installed. This works around + // gem deduplication when rake calls back in to gradle. + onlyIf { + !file("${projectDir}/vendor/jruby/bin/jruby").exists() + } from tarTree(downloadJRuby.dest) eachFile { f -> f.path = f.path.replaceFirst("^jruby-${jRubyVersion}", '') diff --git a/spec/unit/plugin_manager/gem_installer_spec.rb b/spec/unit/plugin_manager/gem_installer_spec.rb index dd85009dfff..89ee9c93a51 100644 --- a/spec/unit/plugin_manager/gem_installer_spec.rb +++ b/spec/unit/plugin_manager/gem_installer_spec.rb @@ -27,18 +27,27 @@ let(:simple_gem) { ::File.join(::File.dirname(__FILE__), "..", "..", "support", "pack", "valid-pack", "logstash", "valid-pack", "#{plugin_name}.gem") } subject { described_class } - let(:temporary_gem_home) { p = Stud::Temporary.pathname; FileUtils.mkdir_p(p); p } + let(:gem_home) { LogStash::Environment.logstash_gem_home } + # Clean up installed gems after each test + after(:each) do + spec_file = ::File.join(gem_home, "specifications", "#{plugin_name}.gemspec") + FileUtils.rm_f(spec_file) if ::File.exist?(spec_file) + gem_dir = ::File.join(gem_home, "gems", plugin_name) + FileUtils.rm_rf(gem_dir) if Dir.exist?(gem_dir) + cache_file = ::File.join(gem_home, "cache", "#{plugin_name}.gem") + FileUtils.rm_f(cache_file) if ::File.exist?(cache_file) + end it "install the specifications in the spec dir" do - subject.install(simple_gem, false, temporary_gem_home) - spec_file = ::File.join(temporary_gem_home, "specifications", "#{plugin_name}.gemspec") + subject.install(simple_gem, false) + spec_file = ::File.join(gem_home, "specifications", "#{plugin_name}.gemspec") expect(::File.exist?(spec_file)).to be_truthy expect(::File.size(spec_file)).to be > 0 end it "install the gem in the gems dir" do - subject.install(simple_gem, false, temporary_gem_home) - gem_dir = ::File.join(temporary_gem_home, "gems", plugin_name) + subject.install(simple_gem, false) + gem_dir = ::File.join(gem_home, "gems", plugin_name) expect(Dir.exist?(gem_dir)).to be_truthy end @@ -50,13 +59,13 @@ context "when we want the message" do it "display the message" do - expect(subject.install(simple_gem, true, temporary_gem_home)).to eq(message) + expect(subject.install(simple_gem, true)).to eq(message) end end context "when we dont want the message" do it "doesn't display the message" do - expect(subject.install(simple_gem, false, temporary_gem_home)).to be_nil + expect(subject.install(simple_gem, false)).to be_nil end end end @@ -65,7 +74,7 @@ context "when we don't want the message" do it "doesn't display the message" do expect(LogStash::PluginManager.ui).not_to receive(:info).with(message) - subject.install(simple_gem, true, temporary_gem_home) + subject.install(simple_gem, true) end end end