|
| 1 | +require 'test_helper' |
| 2 | +require 'nokogiri' |
| 3 | + |
# Tests for Marcel::TikaRegex.to_ruby_regexp, which is expected to turn a
# regex pattern string taken from tika.xml into a Ruby Regexp, or return nil
# when the input is nil, empty, or cannot be compiled by Ruby.
class TikaRegexTest < Marcel::TestCase
  test "converts simple pattern" do
    pattern = '^BZh[1-9]'
    result = Marcel::TikaRegex.to_ruby_regexp(pattern)

    assert_instance_of Regexp, result
    assert_equal(/^BZh[1-9]/, result)
  end

  test "removes multiple dotall flags" do
    pattern = '(?s)first(?s)second'
    result = Marcel::TikaRegex.to_ruby_regexp(pattern)

    assert_instance_of Regexp, result
    # Every inline (?s) must be stripped from the source...
    assert_equal 'firstsecond', result.source
    # ...and be represented by the MULTILINE option on the Regexp instead.
    assert_equal Regexp::MULTILINE, result.options & Regexp::MULTILINE
  end

  test "returns nil for incompatible pattern" do
    # Variable-length lookbehind is not supported in Ruby
    pattern = '(?<=[\\x00][^\\x00]{0,10})[A-Z]'
    result = Marcel::TikaRegex.to_ruby_regexp(pattern)

    assert_nil result, "Incompatible pattern should return nil"
  end

  test "returns nil for nil input" do
    result = Marcel::TikaRegex.to_ruby_regexp(nil)
    assert_nil result
  end

  test "returns nil for empty string" do
    result = Marcel::TikaRegex.to_ruby_regexp('')
    assert_nil result
  end

  test "handles character class overlaps silently" do
    pattern = '[a-zA-Z][A-Za-z0-9_]'

    result = nil
    warnings = capture_stderr do
      result = Marcel::TikaRegex.to_ruby_regexp(pattern)
    end

    assert_instance_of Regexp, result
    assert_equal '', warnings, "Should not produce warnings"
  end

  test "handles multiple flags" do
    pattern = '(?i)(?s)<html>.*</html>'
    result = Marcel::TikaRegex.to_ruby_regexp(pattern)

    assert_instance_of Regexp, result
    assert result.match?("<HTML>\n</HTML>"), "Should be case-insensitive and multiline"
    assert result.match?("<html>\ntest\n</html>"), "Should match content across lines"
  end

  test "compiles all regex patterns from tika.xml" do
    # MIME types with known incompatible patterns
    # These patterns use Java-specific regex features not supported by Ruby
    ignore_list = %w[application/x-dbf]

    # Block form closes the file handle as soon as the document is parsed.
    doc = File.open('data/tika.xml') { |file| Nokogiri::XML(file) }
    patterns_by_type = Hash.new { |hash, type| hash[type] = [] }

    # Extract all regex patterns from tika.xml, grouped by MIME type
    (doc/'mime-info/mime-type').each do |mime|
      type = mime['type']

      (mime/'magic/match[@type="regex"]').each do |match|
        patterns_by_type[type] << match['value']
      end
    end

    patterns_by_type.each do |mime_type, patterns|
      # The ignore decision depends only on the MIME type, so skip the whole
      # group up front instead of re-checking for every pattern.
      next if ignore_list.include?(mime_type)

      patterns.each do |pattern|
        result = Marcel::TikaRegex.to_ruby_regexp(pattern)
        assert_instance_of Regexp, result, "Pattern for #{mime_type} should compile to Regexp: #{pattern}"
      end
    end
  end

  private

  # Runs the given block with $stderr redirected to an in-memory buffer and
  # returns everything written to it. The original $stderr is restored in an
  # ensure clause so a raising block cannot leave the global redirected for
  # the tests that run afterwards.
  def capture_stderr
    original_stderr = $stderr
    buffer = StringIO.new
    $stderr = buffer
    yield
    buffer.string
  ensure
    $stderr = original_stderr
  end
end
0 commit comments