1
1
import mmap
2
+ import io
2
3
import yara
3
4
4
5
from polydet .polyglot_level import PolyglotLevel
8
9
RULES = """
9
10
rule IsHTML {
10
11
strings:
11
- $doctype = /<!DOCTYPE html>/
12
- $opening_tag = /<(html|body|script)/
12
+ $doctype = /<!DOCTYPE html/ nocase
13
+ $opening_tag = /<(html|body|script)/ nocase
13
14
14
15
condition:
15
16
$doctype or $opening_tag
@@ -25,9 +26,7 @@ def check(filename):
25
26
return check_with_matches (filename , {m .rule : m for m in matches })
26
27
27
28
28
- # TODO Add acceptance for whitespace at beginning
29
- # TODO Check lowercase doctypes
30
- # TODO Check unclosed tags as '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"' or '<html onload="">'.
29
+ # TODO Support uppercase and mixed-case doctypes and tags
31
30
# Can be done with ungreedy matching in yara
32
31
# TODO Improve to use results of yara matching
33
32
def check_with_matches (filename : str , matches ):
@@ -56,8 +55,12 @@ def check_with_matches(filename: str, matches):
56
55
break
57
56
if doc_start != - 1 :
58
57
level = PolyglotLevel ()
59
- if doc_start != 0 :
58
+
59
+ buf .seek (0 , io .SEEK_SET )
60
+ begin_content = buf .read (doc_start ) # Read until doc start
61
+ if not __is_whitespace (begin_content ):
60
62
level .add_chunk (0 , doc_start )
63
+
61
64
buf .seek (doc_end )
62
65
contents = buf .read ()
63
66
if not __is_whitespace (contents ):
@@ -68,7 +71,5 @@ def check_with_matches(filename: str, matches):
68
71
69
72
70
73
def __is_whitespace(contents: bytes) -> bool:
    """Return True if every byte of ``contents`` is ASCII whitespace.

    Used to decide whether the bytes before or after the detected HTML
    document are harmless padding or an extra chunk worth reporting.

    :param contents: raw bytes to inspect (iterating yields ints).
    :return: True when ``contents`` is empty or holds only whitespace
        bytes, False as soon as any other byte is found.
    """
    # Space, tab, line feed, form feed and carriage return: the byte set the
    # HTML standard calls "ASCII whitespace". Vertical tab is deliberately
    # not part of it.
    whitespaces = b' \t\n\x0c\r'
    # all() short-circuits on the first non-whitespace byte, so large
    # non-blank inputs are rejected without a full scan or a copy.
    return all(byte in whitespaces for byte in contents)
0 commit comments