Skip to content

Commit 73165bd

Browse files
committed
add tika regex support
This will work for simple things only because we're using a different regex engine. But out of all the current regular expressions, only the one for `application/x-dbf` fails. So I guess we're good. And we can get rid of that html definition now.
1 parent 3d3c5dc commit 73165bd

File tree

10 files changed

+412
-27
lines changed

10 files changed

+412
-27
lines changed

data/custom.xml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,12 @@
11
<mime-info>
2+
<!-- Override audio/mpeg pattern to anchor it at file start -->
3+
<mime-type type="audio/mpeg">
4+
<magic priority="30">
5+
<!-- Anchored version: must match at file start, not mid-buffer -->
6+
<match value="\\A(?:\\x0D\\x0A|\\x00{1,1024})(?:\\xff[\\xe3\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd\\xfe\\xff]|ID3)" type="regex" offset="0"/>
7+
</magic>
8+
</mime-type>
9+
210
<mime-type type="image/svg+xml">
311
<sub-class-of type="application/xml" />
412

lib/marcel.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
module Marcel
44
require "marcel/version"
5+
require "marcel/tika_regex"
56
require "marcel/magic"
67
require "marcel/mime_type"
78
end

lib/marcel/magic.rb

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,9 +126,14 @@ def self.magic_match(io, method)
126126

127127
def self.magic_match_io(io, matches, buffer)
128128
matches.any? do |offset, value, children|
129+
# Skip if value is nil (e.g., invalid regex pattern - it was meant for Java after all)
130+
next false if value.nil?
131+
129132
match =
130133
if value
131-
if Range === offset
134+
if value.is_a?(Regexp)
135+
match_regex(io, offset, value, buffer)
136+
elsif Range === offset
132137
io.read(offset.begin, buffer)
133138
x = io.read(offset.end - offset.begin + value.bytesize, buffer)
134139
x && x.include?(value)
@@ -143,6 +148,33 @@ def self.magic_match_io(io, matches, buffer)
143148
end
144149
end
145150

146-
private_class_method :magic_match, :magic_match_io
151+
# Tests +regexp+ against a fixed-size window of +io+ starting at +offset+.
#
# The bytes before the start position are read and discarded, then up to
# 256 bytes are read and +regexp+ is matched anywhere inside that window.
# When +offset+ is a Range only its beginning is used; the window length
# does not depend on the Range's end. The IO is not rewound here.
#
# @param io [#read] source responding to +read(length, buffer)+
# @param offset [Integer, Range] start position (or range whose +begin+ is the start)
# @param regexp [Regexp] pattern to test against the window
# @param buffer [String] scratch buffer handed to +io.read+
# @return [Boolean] true if the window matches; false on no match, when no
#   data could be read, or when matching raised a non-fatal error
def self.match_regex(io, offset, regexp, buffer)
  start = offset.is_a?(Range) ? offset.begin : offset
  io.read(start, buffer) if start > 0
  data = io.read(256, buffer)
  return false unless data

  # The patterns come from Apache Tika and can trigger warnings about
  # character class overlaps, so warnings are suppressed for this match
  # call only. The previous value is captured before it is changed so the
  # ensure clause always restores the real setting.
  old_verbose = $VERBOSE
  begin
    $VERBOSE = nil

    # Simply match within the data window; the patterns themselves are
    # expected to carry any anchoring they need.
    data.match?(regexp)
  ensure
    $VERBOSE = old_verbose
  end
rescue NoMemoryError, SignalException, SystemExit
  # Process-control and fatal conditions (Ctrl-C, exit, out of memory) must
  # propagate; they are never a "pattern did not match" situation.
  raise
rescue Exception => e
  # A broad rescue is still required because TruffleRuby raises
  # Polyglot::ForeignException, which is not a StandardError.
  warn "Marcel::Magic.match_regex: error matching #{regexp.inspect}: #{e.message}"
  false
end
177+
178+
private_class_method :magic_match, :magic_match_io, :match_regex
147179
end
148180
end

lib/marcel/mime_type/definitions.rb

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# frozen_string_literal: true
22

33
Marcel::MimeType.extend "text/plain", extensions: %w( txt asc )
4-
Marcel::MimeType.extend "text/html", magic: [[0..64, "<!DOCTYPE HTML"], [0..64, "<!DOCTYPE html"], [0..64, "<!doctype HTML"], [0..64, "<!doctype html"]]
54

65
Marcel::MimeType.extend "application/illustrator", parents: "application/pdf"
76
Marcel::MimeType.extend "image/vnd.adobe.photoshop", magic: [[0, "8BPS"]], extensions: %w( psd psb )

lib/marcel/tables.rb

Lines changed: 26 additions & 24 deletions
Large diffs are not rendered by default.

lib/marcel/tika_regex.rb

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# frozen_string_literal: true
2+
3+
module Marcel
  module TikaRegex
    # Conversion passes applied, in order, to turn Tika's double-escaped
    # sequences (two backslashes in the pattern string) into single-escaped
    # Ruby regex syntax. Each pass is specific so that the regex structure
    # is not broken by a blanket replacement.
    DOUBLE_ESCAPES = [
      /\\\\(x[0-9a-fA-F]{2})/,       # \\xHH   -> \xHH (hex byte)
      /\\\\(u[0-9a-fA-F]{4})/,       # \\uHHHH -> \uHHHH (unicode)
      /\\\\([0-7]{1,3})/,            # \\OOO   -> \OOO (octal)
      /\\\\([WDS])/i,                # \\d etc. -> \d (character classes)
      /\\\\([farbentv])/,            # \\n etc. -> \n
      /\\\\([AzZGB])/,               # \\A etc. -> \A (anchors / boundaries)
      /\\\\([()\[\]{}|*+?.^$\\])/    # \\[ etc. -> \[ (literal characters)
    ].freeze

    # Apache Tika uses Java regex syntax, which has some differences from Ruby:
    # - (?s) flag in Java is a mode which makes . match newlines.
    #   In Ruby, this is equivalent to the multiline flag.
    # - Tika's XML uses double-escaped sequences like \\d, \\x00, \\u0041, \\A.
    #   These need to be converted to Ruby's single-escaped format: \d, \x00, \u0041, \A.
    # - Some Java regex features are not supported in Ruby
    #   (e.g., variable-length lookbehinds).
    #
    # This method handles the conversion and returns nil for incompatible
    # patterns. The global $VERBOSE setting is left exactly as it was found,
    # including when the pattern is nil, empty or fails to compile.
    #
    # @param pattern [String, nil] The Tika regex pattern string
    # @return [Regexp, nil] The frozen compiled Ruby Regexp, or nil if the
    #   pattern is nil, empty or incompatible
    def self.to_ruby_regexp(pattern)
      return nil if pattern.nil? || pattern.empty?

      processed = pattern.dup
      flags = 0

      # Converting Java's (?s) dotall flag to Ruby's multiline
      if processed.include?('(?s)')
        processed = processed.gsub('(?s)', '')
        flags |= Regexp::MULTILINE
      end

      # Convert Java-style double-escaped sequences to Ruby single-escaped format
      processed = DOUBLE_ESCAPES.reduce(processed) do |source, escape|
        source.gsub(escape, '\\\\\1')
      end

      # Force binary encoding to handle binary escape sequences like \xff
      processed = processed.force_encoding(Encoding::BINARY)

      compile_quietly(processed, flags)
    rescue RegexpError
      nil
    end

    # Compiles +source+ with warnings silenced, because Tika's patterns
    # trigger warnings about character class overlaps. $VERBOSE is captured
    # before being changed, so the ensure clause always restores the value
    # that was actually in effect.
    #
    # @param source [String] Ruby regex source
    # @param flags [Integer] Regexp option flags
    # @return [Regexp] the frozen compiled pattern
    # @raise [RegexpError] if +source+ is not a valid Ruby regex
    def self.compile_quietly(source, flags)
      old_verbose = $VERBOSE
      $VERBOSE = nil

      Regexp.new(source, flags).freeze
    ensure
      $VERBOSE = old_verbose
    end

    private_class_method :compile_quietly
  end
end

script/generate_tables.rb

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
# Copyright (c) 2011 Daniel Mendler. Available at https://github.com/mimemagicrb/mimemagic.
55

66
require 'nokogiri'
7+
require_relative '../lib/marcel/tika_regex'
78

89
class String
910
alias inspect_old inspect
@@ -27,6 +28,16 @@ def inspect
2728
end
2829
end
2930

31+
class RegexString
32+
def initialize(pattern)
33+
@pattern = pattern
34+
end
35+
36+
def inspect
37+
"r[#{@pattern.inspect}]"
38+
end
39+
end
40+
3041
def str2int(s)
3142
return s.to_i(16) if s[0..1].downcase == '0x'
3243
return s.to_i(8) if s[0..0].downcase == '0'
@@ -39,6 +50,8 @@ def binary_strings(object)
3950
object.map { |o| binary_strings(o) }
4051
when String
4152
BinaryString.new(object)
53+
when RegexString
54+
object
4255
when Numeric, Range, nil
4356
object
4457
else
@@ -65,6 +78,8 @@ def get_matches(mime, parent)
6578

6679
offset = offset.size == 2 ? offset[0]..offset[1] : offset[0]
6780
case type
81+
when 'regex'
82+
value = RegexString.new(value)
6883
when 'string', 'stringignorecase'
6984
value.gsub!(/\A0x([0-9a-f]+)\z/i) { [$1].pack('H*') }
7085
value.gsub!(/\\(x[\dA-Fa-f]{1,2}|0\d{1,3}|\d{1,3}|.)/) { eval("\"\\#{$1}\"") }
@@ -231,6 +246,7 @@ def get_matches(mime, parent)
231246
end
232247
puts " }"
233248
puts " b = Hash.new { |h, k| h[k] = k.b.freeze }"
249+
puts " r = Hash.new { |h, k| h[k] = Marcel::TikaRegex.to_ruby_regexp(k) }"
234250
puts " # @private"
235251
puts " # :nodoc:"
236252
puts " MAGIC = ["
14 Bytes
Binary file not shown.

test/magic_test.rb

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,4 +25,134 @@ class Marcel::MimeType::MagicTest < Marcel::TestCase
2525
assert Marcel::Magic.child?('text/csv', 'text/plain')
2626
refute Marcel::Magic.child?('text/plain', 'text/csv')
2727
end
28+
29+
test "none of the regex patterns should match random test data" do
30+
ignore_list = %w( application/x-dbf )
31+
32+
extract_regexes = lambda do |matching_rules, collected = []|
33+
matching_rules.each do |offset, value, children|
34+
collected << [offset, value] if value.is_a?(Regexp)
35+
extract_regexes.call(children, collected) if children
36+
end
37+
collected
38+
end
39+
40+
# Use a test string that's very unlikely to match any file format regex
41+
# It mixes emoji, invalid UTF-8 bytes, control characters and a plain ASCII word
42+
test_data = "🇨🇭 \xFF\xFE\x03\x05\x06🧀 cheese\x06\x07\x03"
43+
44+
Marcel::MAGIC.each do |type, matching_rules|
45+
next if ignore_list.include?(type)
46+
regexes = extract_regexes.call(matching_rules)
47+
48+
regexes.each do |offset, regex|
49+
buffer = (+"").encode(Encoding::BINARY)
50+
51+
result = Marcel::Magic.send(:match_regex, StringIO.new(test_data), offset, regex, buffer)
52+
53+
assert_equal false, result, "Test data unexpectedly matched a file format regexp (#{type}, #{regex.inspect})"
54+
end
55+
end
56+
end
57+
58+
test "nested match: parent AND child must both match" do
59+
# Rule: offset 0 matches "AAA" AND offset 3 matches "BBB"
60+
# This should match "AAABBB" but not "AAA" alone
61+
test_rules = [
62+
[0, "AAA".b, [[3, "BBB".b]]]
63+
]
64+
65+
buffer = (+"").encode(Encoding::BINARY)
66+
67+
# Should match when both parent and child match
68+
io1 = StringIO.new("AAABBB")
69+
assert Marcel::Magic.send(:magic_match_io, io1, test_rules, buffer),
70+
"Should match when parent and child both match"
71+
72+
# Should NOT match when parent matches but child doesn't
73+
io2 = StringIO.new("AAAXXX")
74+
refute Marcel::Magic.send(:magic_match_io, io2, test_rules, buffer),
75+
"Should not match when parent matches but child doesn't"
76+
end
77+
78+
test "sibling matches use OR logic" do
79+
# Two sibling rules: either can match
80+
# Rule 1: offset 0 matches "XXX"
81+
# Rule 2: offset 0 matches "YYY"
82+
test_rules = [
83+
[0, "XXX".b],
84+
[0, "YYY".b]
85+
]
86+
87+
buffer = (+"").encode(Encoding::BINARY)
88+
89+
# Should match via first sibling
90+
io1 = StringIO.new("XXX")
91+
assert Marcel::Magic.send(:magic_match_io, io1, test_rules, buffer),
92+
"Should match via first sibling rule"
93+
94+
# Should match via second sibling
95+
io2 = StringIO.new("YYY")
96+
assert Marcel::Magic.send(:magic_match_io, io2, test_rules, buffer),
97+
"Should match via second sibling rule"
98+
99+
# Should NOT match when no sibling matches
100+
io3 = StringIO.new("ZZZ")
101+
refute Marcel::Magic.send(:magic_match_io, io3, test_rules, buffer),
102+
"Should not match when no sibling rule matches"
103+
end
104+
105+
test "parent with multiple child alternatives (OR)" do
106+
# Test complex nested structure: parent AND (child1 OR child2)
107+
# Parent at offset 0 matches "ROOT"
108+
# Child option 1: offset 4 matches "OPT1"
109+
# Child option 2: offset 4 matches "OPT2"
110+
test_rules = [
111+
[0, "ROOT".b, [
112+
[4, "OPT1".b], # First child option
113+
[4, "OPT2".b] # Second child option (sibling OR)
114+
]]
115+
]
116+
117+
buffer = (+"").encode(Encoding::BINARY)
118+
119+
# Should match when parent and first child match
120+
io1 = StringIO.new("ROOTOPT1")
121+
assert Marcel::Magic.send(:magic_match_io, io1, test_rules, buffer),
122+
"Should match when parent and first child match"
123+
124+
# Should match when parent and second child match
125+
io2 = StringIO.new("ROOTOPT2")
126+
assert Marcel::Magic.send(:magic_match_io, io2, test_rules, buffer),
127+
"Should match when parent and second child match"
128+
129+
# Should NOT match when parent matches but no child matches
130+
io3 = StringIO.new("ROOTXXXX")
131+
refute Marcel::Magic.send(:magic_match_io, io3, test_rules, buffer),
132+
"Should not match when parent matches but no child matches"
133+
end
134+
135+
test "complex nested structure with multiple levels" do
136+
# Parent AND (Child AND Grandchild)
137+
# offset 0: "AAA", offset 3: "BBB", offset 6: "CCC"
138+
test_rules = [
139+
[0, "AAA".b, [
140+
[3, "BBB".b, [
141+
[6, "CCC".b]
142+
]]
143+
]]
144+
]
145+
146+
buffer = (+"").encode(Encoding::BINARY)
147+
148+
# Should match when all levels match
149+
io1 = StringIO.new("AAABBBCCC")
150+
assert Marcel::Magic.send(:magic_match_io, io1, test_rules, buffer),
151+
"Should match when all nested levels match"
152+
153+
# Should NOT match when grandchild doesn't match
154+
io2 = StringIO.new("AAABBBXXX")
155+
refute Marcel::Magic.send(:magic_match_io, io2, test_rules, buffer),
156+
"Should not match when deepest child doesn't match"
157+
end
28158
end

0 commit comments

Comments
 (0)