From a7b7ea30ba8b916b20f67ca660e4bbc7425c032a Mon Sep 17 00:00:00 2001
From: mr-tz <moritz.raabe@mandiant.com>
Date: Thu, 9 Nov 2023 12:53:06 +0100
Subject: [PATCH 1/2] fix b2s wide/utf-8 string handling via workaround

---
 floss/language/rust/extract.py | 55 +++++++++++++++++++++++++++++-----
 1 file changed, 48 insertions(+), 7 deletions(-)

diff --git a/floss/language/rust/extract.py b/floss/language/rust/extract.py
index f67e57c82..3d3dc3960 100644
--- a/floss/language/rust/extract.py
+++ b/floss/language/rust/extract.py
@@ -4,7 +4,7 @@
 import pathlib
 import argparse
 import itertools
-from typing import List, Tuple, Iterable
+from typing import List, Tuple, Iterable, Optional
 
 import pefile
 import binary2strings as b2s
@@ -25,6 +25,41 @@ def get_rdata_section(pe: pefile.PE) -> pefile.SectionStructure:
     raise ValueError("no .rdata section found")
 
 
+def fix_b2s_wide_strings(
+    strings: List[Tuple[str, str, Tuple[int, int], bool]], min_length: int, buffer: bytes
+) -> List[Tuple[str, str, Tuple[int, int], bool]]:
+    """
+    work around b2s reporting a wide string where a UTF-8 string starts one byte later
+
+    wide strings are dropped from the result; a non-wide string that is a substring of the
+    pending fixup (re-extracted from buffer, at least min_length long) is replaced by it
+    """
+    # TODO(mr-tz): b2s may parse wide strings where there really should be utf-8 strings
+    #  handle special cases here until fixed
+    #  https://github.com/mandiant/flare-floss/issues/867
+    fixed_strings: List[Tuple[str, str, Tuple[int, int], bool]] = list()
+    last_fixup: Optional[Tuple[str, str, Tuple[int, int], bool]] = None
+    for string in strings:
+        s, string_type, start = string[0], string[1], string[2][0]
+        if string_type == "WIDE_STRING":
+            # utf-8 strings will not start with \x00; slicing keeps an empty encoding from raising
+            if s.encode("utf-16le", "ignore")[:1] == b"\x00":
+                offset = start + 1
+                new_string = b2s.extract_string(buffer[offset:])
+                # rebase the reported offsets from the slice onto buffer
+                span = (new_string[2][0] + offset, new_string[2][1] + offset)
+                last_fixup = (new_string[0], new_string[1], span, new_string[3])
+                if len(last_fixup[0]) < min_length:
+                    last_fixup = None
+        else:
+            if last_fixup and s in last_fixup[0]:
+                fixed_strings.append(last_fixup)
+            else:
+                fixed_strings.append(string)
+            last_fixup = None
+    return fixed_strings
+
+
 def filter_and_transform_utf8_strings(
     strings: List[Tuple[str, str, Tuple[int, int], bool]],
     start_rdata: int,
@@ -46,7 +81,7 @@ def filter_and_transform_utf8_strings(
     return transformed_strings
 
 
-def split_strings(static_strings: List[StaticString], address: int) -> None:
+def split_strings(static_strings: List[StaticString], address: int, min_length: int) -> None:
     """
     if address is in between start and end of a string in ref data then split the string
     this modifies the elements of the static strings list directly
@@ -57,8 +92,12 @@ def split_strings(static_strings: List[StaticString], address: int) -> None:
             rust_string = string.string[0 : address - string.offset]
             rest = string.string[address - string.offset :]
 
-            static_strings.append(StaticString(string=rust_string, offset=string.offset, encoding=StringEncoding.UTF8))
-            static_strings.append(StaticString(string=rest, offset=address, encoding=StringEncoding.UTF8))
+            if len(rust_string) >= min_length:
+                static_strings.append(
+                    StaticString(string=rust_string, offset=string.offset, encoding=StringEncoding.UTF8)
+                )
+            if len(rest) >= min_length:
+                static_strings.append(StaticString(string=rest, offset=address, encoding=StringEncoding.UTF8))
 
             # remove string from static_strings
             for static_string in static_strings:
@@ -97,12 +136,14 @@ def get_string_blob_strings(pe: pefile.PE, min_length: int) -> Iterable[StaticSt
     end_rdata = start_rdata + rdata_section.SizeOfRawData
     virtual_address = rdata_section.VirtualAddress
     pointer_to_raw_data = rdata_section.PointerToRawData
+    buffer_rdata = rdata_section.get_data()
 
     # extract utf-8 and wide strings, latter not needed here
-    strings = b2s.extract_all_strings(rdata_section.get_data(), min_length)
+    strings = b2s.extract_all_strings(buffer_rdata, min_length)
+    fixed_strings = fix_b2s_wide_strings(strings, min_length, buffer_rdata)
 
     # select only UTF-8 strings and adjust offset
-    static_strings = filter_and_transform_utf8_strings(strings, start_rdata)
+    static_strings = filter_and_transform_utf8_strings(fixed_strings, start_rdata)
 
     struct_string_addrs = map(lambda c: c.address, get_struct_string_candidates(pe))
 
@@ -126,7 +167,7 @@ def get_string_blob_strings(pe: pefile.PE, min_length: int) -> Iterable[StaticSt
         if not (start_rdata <= address < end_rdata):
             continue
 
-        split_strings(static_strings, address)
+        split_strings(static_strings, address, min_length)
 
     return static_strings
 

From 98fbde05a9c657259229f8a2888bc7e19f2f147b Mon Sep 17 00:00:00 2001
From: mr-tz <moritz.raabe@mandiant.com>
Date: Thu, 9 Nov 2023 13:57:14 +0100
Subject: [PATCH 2/2] enable test

---
 tests/test_language_extract_rust.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/test_language_extract_rust.py b/tests/test_language_extract_rust.py
index f507efe75..9d210ce0c 100644
--- a/tests/test_language_extract_rust.py
+++ b/tests/test_language_extract_rust.py
@@ -30,12 +30,11 @@ def rust_strings64():
         # .rdata:00000001400BD040 30 D0 0B 40 01 00 pieces          ___str_ <offset aHelloWorld, 0Eh>
         # .rdata:00000001400BD040 00 00 00 00                                               ; "Hello, world!\n"
         pytest.param("Hello, world!", 0xBB030, StringEncoding.UTF8, "rust_strings64"),
-        # TODO enable, see issue #867
         # .rdata:00000001400BD050 69 6E 76 61 6C 69 aInvalidArgs    db 'invalid args',0
         # .rdata:00000001400BD05D 00 00 00                          align 20h
         # .rdata:00000001400BD060 50 D0 0B 40 01 00 stru_1400BD060  ___str_ <offset aInvalidArgs, 0Ch>
         # .rdata:00000001400BD060 00 00 00 00                                               ; "invalid args"
-        # pytest.param("invalid args", 0xBB050, StringEncoding.UTF8, "rust_strings64"),
+        pytest.param("invalid args", 0xBB050, StringEncoding.UTF8, "rust_strings64"),
     ],
 )
 def test_data_string_offset(request, string, offset, encoding, rust_strings):