Skip to content

Commit 61d8d7d

Browse files
committed
add tika regex support
This will work for simple things only because we're using a different regex engine. But out of all the current regular expressions, only the one for `application/x-dbf` fails. So I guess we're good. And we can get rid of that html definition now.
1 parent 3d3c5dc commit 61d8d7d

File tree

8 files changed

+231
-27
lines changed

8 files changed

+231
-27
lines changed

lib/marcel.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
module Marcel
44
require "marcel/version"
5+
require "marcel/tika_regex"
56
require "marcel/magic"
67
require "marcel/mime_type"
78
end

lib/marcel/magic.rb

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,9 +126,14 @@ def self.magic_match(io, method)
126126

127127
def self.magic_match_io(io, matches, buffer)
128128
matches.any? do |offset, value, children|
129+
# Skip if value is nil (e.g., invalid regex pattern - it was meant for Java after all)
130+
next false if value.nil?
131+
129132
match =
130133
if value
131-
if Range === offset
134+
if value.is_a?(Regexp)
135+
match_regex(io, offset, value, buffer)
136+
elsif Range === offset
132137
io.read(offset.begin, buffer)
133138
x = io.read(offset.end - offset.begin + value.bytesize, buffer)
134139
x && x.include?(value)
@@ -143,6 +148,25 @@ def self.magic_match_io(io, matches, buffer)
143148
end
144149
end
145150

146-
private_class_method :magic_match, :magic_match_io
151+
# Tests whether +regexp+ matches a window of bytes read from +io+.
#
# The stream is first advanced to the starting offset (the beginning of the
# range when +offset+ is a Range, otherwise +offset+ itself). Up to 256 bytes
# are then read and the regexp is matched against that window only. When
# +offset+ is a Range, only its +begin+ is consulted.
#
# @param io [#read] stream to inspect; it is consumed as a side effect
# @param offset [Integer, Range] where the inspected window starts
# @param regexp [Regexp] pattern to test against the window
# @param buffer [String] scratch buffer handed to +io.read+
# @return [Boolean] true when the window matches; false when it does not
#   or when nothing could be read at that position
def self.match_regex(io, offset, regexp, buffer)
  skip = Range === offset ? offset.begin : offset
  io.read(skip, buffer) if skip.positive?

  window = io.read(256, buffer)
  return false unless window

  # The patterns come from Apache Tika and trigger Ruby warnings about
  # character class overlaps, so warnings are silenced for the duration of
  # the match and the previous setting is restored afterwards.
  saved_verbose = $VERBOSE
  $VERBOSE = nil
  begin
    window.match?(regexp)
  ensure
    $VERBOSE = saved_verbose
  end
end
169+
170+
private_class_method :magic_match, :magic_match_io, :match_regex
147171
end
148172
end

lib/marcel/mime_type/definitions.rb

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# frozen_string_literal: true
22

33
Marcel::MimeType.extend "text/plain", extensions: %w( txt asc )
4-
Marcel::MimeType.extend "text/html", magic: [[0..64, "<!DOCTYPE HTML"], [0..64, "<!DOCTYPE html"], [0..64, "<!doctype HTML"], [0..64, "<!doctype html"]]
54

65
Marcel::MimeType.extend "application/illustrator", parents: "application/pdf"
76
Marcel::MimeType.extend "image/vnd.adobe.photoshop", magic: [[0, "8BPS"]], extensions: %w( psd psb )

lib/marcel/tables.rb

Lines changed: 25 additions & 24 deletions
Large diffs are not rendered by default.

lib/marcel/tika_regex.rb

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# frozen_string_literal: true
2+
3+
module Marcel
  module TikaRegex
    # Apache Tika uses Java regex syntax, which has some differences from Ruby:
    # - (?s) flag in Java is a mode which makes . match newlines.
    #   In Ruby, this is equivalent to the multiline flag.
    # - Naturally, some Java regex features are not supported in Ruby
    #   (e.g., variable-length lookbehinds).
    #
    # This method handles the conversion and gracefully returns nil for
    # incompatible patterns.
    #
    # @param pattern [String, nil] The Tika regex pattern string
    # @return [Regexp, nil] The compiled, frozen Ruby Regexp, or nil if the
    #   pattern is nil, empty, or cannot be compiled by Ruby
    def self.to_ruby_regexp(pattern)
      return nil if pattern.nil? || pattern.empty?

      processed = pattern
      flags = 0

      # Converting Java's (?s) dotall flag to Ruby's multiline: every inline
      # occurrence is stripped and the option is applied to the whole pattern.
      if pattern.include?('(?s)')
        processed = pattern.gsub('(?s)', '')
        flags |= Regexp::MULTILINE
      end

      # The patterns come from Apache Tika and produce warnings about
      # character class overlaps, so warnings are silenced for the compilation
      # only. The save/restore is deliberately scoped to this begin/ensure
      # block: a method-level `ensure` would also run on the early return
      # above, before $VERBOSE was saved, and would leave $VERBOSE set to nil
      # for the rest of the process.
      old_verbose = $VERBOSE
      begin
        $VERBOSE = nil
        Regexp.new(processed, flags).freeze
      rescue RegexpError
        nil
      ensure
        $VERBOSE = old_verbose
      end
    end
  end
end

script/generate_tables.rb

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
# Copyright (c) 2011 Daniel Mendler. Available at https://github.com/mimemagicrb/mimemagic.
55

66
require 'nokogiri'
7+
require_relative '../lib/marcel/tika_regex'
78

89
class String
910
alias inspect_old inspect
@@ -27,6 +28,16 @@ def inspect
2728
end
2829
end
2930

31+
# Wrapper marking a magic value as a regex pattern rather than a plain string.
# Its +inspect+ output has the form r[...], i.e. an index into a table named
# +r+ keyed by the pattern.
class RegexString
  # @param pattern [String] the raw regex pattern text
  def initialize(pattern)
    @pattern = pattern
  end

  # @return [String] the inspected pattern wrapped in r[...]
  def inspect
    quoted = @pattern.inspect
    'r[' + quoted + ']'
  end
end
40+
3041
def str2int(s)
3142
return s.to_i(16) if s[0..1].downcase == '0x'
3243
return s.to_i(8) if s[0..0].downcase == '0'
@@ -39,6 +50,8 @@ def binary_strings(object)
3950
object.map { |o| binary_strings(o) }
4051
when String
4152
BinaryString.new(object)
53+
when RegexString
54+
object
4255
when Numeric, Range, nil
4356
object
4457
else
@@ -65,6 +78,8 @@ def get_matches(mime, parent)
6578

6679
offset = offset.size == 2 ? offset[0]..offset[1] : offset[0]
6780
case type
81+
when 'regex'
82+
value = RegexString.new(value)
6883
when 'string', 'stringignorecase'
6984
value.gsub!(/\A0x([0-9a-f]+)\z/i) { [$1].pack('H*') }
7085
value.gsub!(/\\(x[\dA-Fa-f]{1,2}|0\d{1,3}|\d{1,3}|.)/) { eval("\"\\#{$1}\"") }
@@ -231,6 +246,7 @@ def get_matches(mime, parent)
231246
end
232247
puts " }"
233248
puts " b = Hash.new { |h, k| h[k] = k.b.freeze }"
249+
puts " r = Hash.new { |h, k| h[k] = Marcel::TikaRegex.to_ruby_regexp(k) }"
234250
puts " # @private"
235251
puts " # :nodoc:"
236252
puts " MAGIC = ["

test/magic_test.rb

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
require 'test_helper'
22
require 'rack'
3+
require 'nokogiri'
34

45
class Marcel::MimeType::MagicTest < Marcel::TestCase
56
# These fixtures should be recognisable given only their contents. Where a generic type
@@ -25,4 +26,34 @@ class Marcel::MimeType::MagicTest < Marcel::TestCase
2526
assert Marcel::Magic.child?('text/csv', 'text/plain')
2627
refute Marcel::Magic.child?('text/plain', 'text/csv')
2728
end
29+
30+
# Smoke test: every regex magic pattern in data/tika.xml (except the ones in
# the ignore list) must be usable by Marcel::Magic.match_regex and must not
# match an arbitrary, unrelated string.
test ".match_regex" do
  ignore_list = %w[application/x-dbf]
  # Block form so the file handle is closed once the document is parsed.
  doc = File.open('data/tika.xml') { |file| Nokogiri::XML(file) }

  (doc/'mime-info/mime-type').each do |mime|
    type = mime['type']
    next if ignore_list.include?(type)

    (mime/'magic/match[@type="regex"]').each do |match|
      offset_str = match['offset'] || '0'

      # "a:b" denotes an offset range, anything else a single offset.
      offset =
        if offset_str.include?(':')
          first, last = offset_str.split(':').map(&:to_i)
          first..last
        else
          offset_str.to_i
        end

      regex = Marcel::TikaRegex.to_ruby_regexp(match['value'])
      buffer = String.new(encoding: Encoding::BINARY)
      io = StringIO.new("test content that won't match")

      # match_regex is private, hence the send.
      result = Marcel::Magic.send(:match_regex, io, offset, regex, buffer)

      assert_equal false, result, "Maybe the test string needs to be updated, but it is very unlikely that it matches a file regexp (#{type})"
    end
  end
end
2859
end

test/tika_regex_test.rb

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
require 'test_helper'
2+
require 'nokogiri'
3+
4+
# Tests for Marcel::TikaRegex.to_ruby_regexp: flag conversion, handling of
# nil/empty/incompatible patterns, warning suppression, and compilation of the
# regex patterns found in data/tika.xml.
class TikaRegexTest < Marcel::TestCase
  test "converts simple pattern" do
    pattern = '^BZh[1-9]'
    result = Marcel::TikaRegex.to_ruby_regexp(pattern)

    assert_instance_of Regexp, result
    assert_equal(/^BZh[1-9]/, result)
  end

  test "removes multiple dotall flags" do
    pattern = '(?s)first(?s)second'
    result = Marcel::TikaRegex.to_ruby_regexp(pattern)

    assert_instance_of Regexp, result
    assert_equal 'firstsecond', result.source
    assert_equal Regexp::MULTILINE, result.options & Regexp::MULTILINE
  end

  test "returns nil for incompatible pattern" do
    # Variable-length lookbehind is not supported in Ruby
    pattern = '(?<=[\\x00][^\\x00]{0,10})[A-Z]'
    result = Marcel::TikaRegex.to_ruby_regexp(pattern)

    assert_nil result, "Incompatible pattern should return nil"
  end

  test "returns nil for nil input" do
    result = Marcel::TikaRegex.to_ruby_regexp(nil)
    assert_nil result
  end

  test "returns nil for empty string" do
    result = Marcel::TikaRegex.to_ruby_regexp('')
    assert_nil result
  end

  test "handles character class overlaps silently" do
    pattern = '[a-zA-Z][A-Za-z0-9_]'

    # Capture stderr to check for warnings; the ensure guarantees the real
    # stream is put back even if the conversion raises.
    old_stderr = $stderr
    $stderr = StringIO.new
    begin
      result = Marcel::TikaRegex.to_ruby_regexp(pattern)
      warnings = $stderr.string
    ensure
      $stderr = old_stderr
    end

    assert_instance_of Regexp, result
    assert_equal '', warnings, "Should not produce warnings"
  end

  test "handles multiple flags" do
    pattern = '(?i)(?s)<html>.*</html>'
    result = Marcel::TikaRegex.to_ruby_regexp(pattern)

    assert_instance_of Regexp, result
    assert result.match?("<HTML>\n</HTML>"), "Should be case-insensitive and multiline"
    assert result.match?("<html>\ntest\n</html>"), "Should match content across lines"
  end

  test "compiles all regex patterns from tika.xml" do
    # MIME types with known incompatible patterns
    # These patterns use Java-specific regex features not supported by Ruby
    ignore_list = %w[application/x-dbf]

    # Block form so the file handle is closed once the document is parsed.
    doc = File.open('data/tika.xml') { |file| Nokogiri::XML(file) }

    (doc/'mime-info/mime-type').each do |mime|
      mime_type = mime['type']
      next if ignore_list.include?(mime_type)

      (mime/'magic/match[@type="regex"]').each do |match|
        pattern = match['value']
        result = Marcel::TikaRegex.to_ruby_regexp(pattern)
        assert_instance_of Regexp, result, "Pattern for #{mime_type} should compile to Regexp: #{pattern}"
      end
    end
  end
end

0 commit comments

Comments
 (0)