Merge pull request mandiant#897 from mr-tz/workaround-fix-b2s-wide-strings

fix b2s wide/utf-8 string handling via workaround
Arker123 · Nov 9, 2023 · 9405cb8 · 9405cb8
2 parents 52747a4 + 98fbde0
commit 9405cb8
Show file tree

Hide file tree

Showing 2 changed files with 49 additions and 9 deletions.
diff --git a/floss/language/rust/extract.py b/floss/language/rust/extract.py
@@ -4,7 +4,7 @@
 import pathlib
 import argparse
 import itertools
-from typing import List, Tuple, Iterable
+from typing import List, Tuple, Iterable, Optional
 
 import pefile
 import binary2strings as b2s
@@ -25,6 +25,41 @@ def get_rdata_section(pe: pefile.PE) -> pefile.SectionStructure:
     raise ValueError("no .rdata section found")
 
 
+def fix_b2s_wide_strings(
+    strings: List[Tuple[str, str, Tuple[int, int], bool]], min_length: int, buffer: bytes
+) -> List[Tuple[str, str, Tuple[int, int], bool]]:
+    """
+    work around b2s reporting a wide string where there should be a utf-8 string
+
+    wide strings are never copied to the result. when the utf-16le encoding of a wide string
+    starts with a null byte, a string is re-extracted from buffer one byte after the wide
+    string's start and kept as a pending fixup. the next non-wide string is replaced by that
+    fixup if its text is contained in the fixup's text, otherwise it is copied unchanged.
+
+    args:
+      strings: b2s result tuples; this function reads the text [0], the type [1] and the
+        start offset [2][0], the remaining fields are passed through untouched
+      min_length: re-extracted strings shorter than this are discarded
+      buffer: bytes that the start offsets in strings index into
+
+    returns: new list holding the non-wide strings, some of them replaced by their fixup
+    """
+    # TODO(mr-tz): b2s may parse wide strings where there really should be utf-8 strings
+    #  handle special cases here until fixed
+    #  https://github.com/mandiant/flare-floss/issues/867
+    fixed_strings: List[Tuple[str, str, Tuple[int, int], bool]] = list()
+    # re-extracted replacement waiting for the next non-wide string, None if there is none
+    last_fixup: Optional[Tuple[str, str, Tuple[int, int], bool]] = None
+    for string in strings:
+        s = string[0]
+        string_type = string[1]
+        start = string[2][0]
+
+        if string_type == "WIDE_STRING":
+            # NOTE(review): sd is empty if s is empty or all of its characters are dropped by
+            #  "ignore"; sd[0] would then raise IndexError - confirm b2s never yields such strings
+            sd = s.encode("utf-16le", "ignore")
+            # utf-8 strings will not start with \x00
+            if sd[0] == 0:
+                # presumably b2s started the wide string one byte too early, so extraction is
+                #  retried at start + 1 - see the issue linked above
+                new_string = b2s.extract_string(buffer[start + 1 :])
+                # offsets of new_string are relative to the slice, shift them back so they are
+                #  relative to buffer again
+                last_fixup = (
+                    new_string[0],
+                    new_string[1],
+                    (new_string[2][0] + start + 1, new_string[2][1] + start + 1),
+                    new_string[3],
+                )
+                if len(last_fixup[0]) < min_length:
+                    last_fixup = None
+        else:
+            # substring test only: the fixup replaces this string when its text contains s
+            if last_fixup and s in last_fixup[0]:
+                fixed_strings.append(last_fixup)
+            else:
+                fixed_strings.append(string)
+            # a pending fixup is consumed or dropped by the first non-wide string that follows;
+            #  a fixup with no following non-wide string is never emitted
+            last_fixup = None
+    return fixed_strings
+
+
 def filter_and_transform_utf8_strings(
     strings: List[Tuple[str, str, Tuple[int, int], bool]],
     start_rdata: int,
@@ -46,7 +81,7 @@ def filter_and_transform_utf8_strings(
     return transformed_strings
 
 
-def split_strings(static_strings: List[StaticString], address: int) -> None:
+def split_strings(static_strings: List[StaticString], address: int, min_length: int) -> None:
     """
     if address is in between start and end of a string in ref data then split the string
     this modifies the elements of the static strings list directly
@@ -57,8 +92,12 @@ def split_strings(static_strings: List[StaticString], address: int) -> None:
             rust_string = string.string[0 : address - string.offset]
             rest = string.string[address - string.offset :]
 
-            static_strings.append(StaticString(string=rust_string, offset=string.offset, encoding=StringEncoding.UTF8))
-            static_strings.append(StaticString(string=rest, offset=address, encoding=StringEncoding.UTF8))
+            if len(rust_string) >= min_length:
+                static_strings.append(
+                    StaticString(string=rust_string, offset=string.offset, encoding=StringEncoding.UTF8)
+                )
+            if len(rest) >= min_length:
+                static_strings.append(StaticString(string=rest, offset=address, encoding=StringEncoding.UTF8))
 
             # remove string from static_strings
             for static_string in static_strings:
@@ -97,12 +136,14 @@ def get_string_blob_strings(pe: pefile.PE, min_length: int) -> Iterable[StaticSt
     end_rdata = start_rdata + rdata_section.SizeOfRawData
     virtual_address = rdata_section.VirtualAddress
     pointer_to_raw_data = rdata_section.PointerToRawData
+    buffer_rdata = rdata_section.get_data()
 
     # extract utf-8 and wide strings, latter not needed here
-    strings = b2s.extract_all_strings(rdata_section.get_data(), min_length)
+    strings = b2s.extract_all_strings(buffer_rdata, min_length)
+    fixed_strings = fix_b2s_wide_strings(strings, min_length, buffer_rdata)
 
     # select only UTF-8 strings and adjust offset
-    static_strings = filter_and_transform_utf8_strings(strings, start_rdata)
+    static_strings = filter_and_transform_utf8_strings(fixed_strings, start_rdata)
 
     struct_string_addrs = map(lambda c: c.address, get_struct_string_candidates(pe))
 
@@ -126,7 +167,7 @@ def get_string_blob_strings(pe: pefile.PE, min_length: int) -> Iterable[StaticSt
         if not (start_rdata <= address < end_rdata):
             continue
 
-        split_strings(static_strings, address)
+        split_strings(static_strings, address, min_length)
 
     return static_strings
 

diff --git a/tests/test_language_extract_rust.py b/tests/test_language_extract_rust.py
@@ -30,12 +30,11 @@ def rust_strings64():
         # .rdata:00000001400BD040 30 D0 0B 40 01 00 pieces          ___str_ <offset aHelloWorld, 0Eh>
         # .rdata:00000001400BD040 00 00 00 00                                               ; "Hello, world!\n"
         pytest.param("Hello, world!", 0xBB030, StringEncoding.UTF8, "rust_strings64"),
-        # TODO enable, see issue #867
         # .rdata:00000001400BD050 69 6E 76 61 6C 69 aInvalidArgs    db 'invalid args',0
         # .rdata:00000001400BD05D 00 00 00                          align 20h
         # .rdata:00000001400BD060 50 D0 0B 40 01 00 stru_1400BD060  ___str_ <offset aInvalidArgs, 0Ch>
         # .rdata:00000001400BD060 00 00 00 00                                               ; "invalid args"
-        # pytest.param("invalid args", 0xBB050, StringEncoding.UTF8, "rust_strings64"),
+        pytest.param("invalid args", 0xBB050, StringEncoding.UTF8, "rust_strings64"),
     ],
 )
 def test_data_string_offset(request, string, offset, encoding, rust_strings):