datalake: parser for protobuf message offset list

jcipar · jcipar · commit 5a3b74932967 · 2024-10-01T13:39:05.000-04:00
Protobuf records have an additional prefix after the schema id. Since a
protobuf schema can contain multiple message types, and those messages
may be nested, they contain a list of message ids to be traversed to
find the actual message schema for this record. These are encoded as a
varint representing the length followed by a list of varints.
diff --git a/src/v/datalake/schema_registry.cc b/src/v/datalake/schema_registry.cc
@@ -9,10 +9,11 @@
  */
 #include "datalake/schema_registry.h"
 
+#include "bytes/iobuf_parser.h"
 #include "pandaproxy/schema_registry/types.h"
 
 namespace datalake {
-get_schema_id_result get_value_schema_id(const iobuf& buf) {
+get_schema_id_result get_value_schema_id(iobuf& buf) {
     // Messages that use the schema registry have a 5-byte prefix:
     // offset 0: magic byte with value 0
     // offsets 1-4: schema ID as big-endian integer
@@ -27,7 +28,55 @@ get_schema_id_result get_value_schema_id(const iobuf& buf) {
         return get_schema_error::no_schema_id;
     }
     auto id = parser.consume_be_type<int32_t>();
-    return pandaproxy::schema_registry::schema_id{id};
+    schema_message_data res = {
+      .schema_id = pandaproxy::schema_registry::schema_id{id},
+      .shared_message_data = buf.share(
+        parser.bytes_consumed(), parser.bytes_left())};
+
+    return res;
+}
+
+// TODO: this is mostly a copy-and-paste of get_proto_offsets from
+// pandaproxy::schema_registry with a slightly different interface. Unify these.
+get_proto_offsets_result get_proto_offsets(iobuf& buf) {
+    auto header = get_value_schema_id(buf);
+    if (!header.has_value()) {
+        return header.error();
+    }
+    proto_schema_message_data result;
+    result.schema_id = header.value().schema_id;
+    iobuf_const_parser parser(header.value().shared_message_data);
+
+    // The encoding is a length, followed by indexes into the file or message.
+    // Each number is a zigzag encoded integer.
+    auto [offset_count, bytes_read] = parser.read_varlong();
+    if (!bytes_read) {
+        return get_schema_error::bad_varint;
+    }
+    // Reject more offsets than bytes remaining; it's not possible
+    if (static_cast<size_t>(offset_count) > parser.bytes_left()) {
+        return get_schema_error::not_enough_bytes;
+    }
+    if (offset_count == 0) {
+        result.protobuf_offsets.push_back(0);
+        result.shared_message_data = header.value().shared_message_data.share(
+          parser.bytes_consumed(), parser.bytes_left());
+        return result;
+    }
+    result.protobuf_offsets.resize(offset_count);
+    for (auto& o : result.protobuf_offsets) {
+        if (parser.bytes_left() == 0) {
+            return get_schema_error::not_enough_bytes;
+        }
+        std::tie(o, bytes_read) = parser.read_varlong();
+        if (!bytes_read) {
+            return get_schema_error::bad_varint;
+        }
+    }
+
+    result.shared_message_data = header.value().shared_message_data.share(
+      parser.bytes_consumed(), parser.bytes_left());
+    return result;
 }
 
 } // namespace datalake
diff --git a/src/v/datalake/schema_registry.h b/src/v/datalake/schema_registry.h
@@ -18,10 +18,26 @@ class iobuf;
 
 namespace datalake {
 
+// Stores a pair of schema id and the remaining message data after the schema
+// prefix. The message data is shared from the original iobuf.
+struct schema_message_data {
+    pandaproxy::schema_registry::schema_id schema_id;
+    iobuf shared_message_data;
+};
+
+// Stores the schema_id, protobuf message offsets, and remaining message data.
+// The message data is shared from the original iobuf.
+struct proto_schema_message_data {
+    pandaproxy::schema_registry::schema_id schema_id;
+    std::vector<int32_t> protobuf_offsets;
+    iobuf shared_message_data;
+};
+
 enum class get_schema_error {
     ok = 0,
     no_schema_id,
     not_enough_bytes,
+    bad_varint,
 };
 
 struct get_schema_error_category : std::error_category {
@@ -35,6 +51,8 @@ struct get_schema_error_category : std::error_category {
             return "No schema ID";
         case get_schema_error::not_enough_bytes:
             return "Not enough bytes in message";
+        case get_schema_error::bad_varint:
+            return "Bad encoded value for varint";
         }
     }
 
@@ -48,13 +66,15 @@ inline std::error_code make_error_code(get_schema_error e) noexcept {
     return {static_cast<int>(e), get_schema_error_category::error_category()};
 }
 
-using get_schema_id_result
-  = result<pandaproxy::schema_registry::schema_id, get_schema_error>;
+using get_schema_id_result = result<schema_message_data, get_schema_error>;
+using get_proto_offsets_result
+  = result<proto_schema_message_data, get_schema_error>;
 
 // Extract the schema id from a record's value. This simply extracts the id. It
 // does not do any validation. Returns std::nullopt if the record does not have
 // a schema id.
-get_schema_id_result get_value_schema_id(const iobuf& record);
+get_schema_id_result get_value_schema_id(iobuf& record);
+get_proto_offsets_result get_proto_offsets(iobuf& record);
 
 } // namespace datalake
 
diff --git a/src/v/datalake/tests/schema_registry_test.cc b/src/v/datalake/tests/schema_registry_test.cc
@@ -11,6 +11,7 @@
 #include "bytes/iobuf.h"
 #include "datalake/schema_registry.h"
 #include "model/record.h"
+#include "utils/vint.h"
 
 #include <gtest/gtest.h>
 
@@ -22,12 +23,15 @@ TEST(DatalakeSchemaRegistry, SchemaIdForValidRecord) {
     uint8_t magic = 0;
     int32_t schema_id = 12;
     int32_t schema_id_encoded = htobe32(schema_id);
+    std::string payload = "Hello world";
     value.append(&magic, 1);
     value.append(reinterpret_cast<uint8_t*>(&schema_id_encoded), 4);
+    value.append(payload.data(), payload.size());
 
     auto res = datalake::get_value_schema_id(value);
     ASSERT_TRUE(res.has_value());
-    EXPECT_EQ(res.value()(), schema_id);
+    EXPECT_EQ(res.value().schema_id(), schema_id);
+    EXPECT_EQ(res.value().shared_message_data.size_bytes(), payload.size());
 }
 
 TEST(DatalakeSchemaRegistry, SchemaIdForShortRecord) {
@@ -57,3 +61,86 @@ TEST(DatalakeSchemaRegistry, SchemaIdForBadMagic) {
     ASSERT_TRUE(res.has_error());
     EXPECT_EQ(res.error(), datalake::get_schema_error::no_schema_id);
 }
+
+TEST(DatalakeSchemaRegistry, GetProtoOffsetsOk) {
+    iobuf value;
+
+    uint8_t magic = 0; // Invalid magic
+    uint32_t schema_id = 12;
+    int32_t schema_id_encoded = htobe32(schema_id);
+    std::string payload = "Hello world";
+    value.append(&magic, 1);
+    value.append(reinterpret_cast<uint8_t*>(&schema_id_encoded), 4);
+
+    uint8_t proto_msg_count = 9;
+    std::array<uint8_t, 16> encoded;
+    size_t encoded_size = vint::serialize(proto_msg_count, encoded.data());
+    value.append(encoded.data(), encoded_size);
+
+    for (uint8_t i = 0; i < proto_msg_count; i++) {
+        encoded_size = vint::serialize(i, encoded.data());
+        value.append(encoded.data(), encoded_size);
+    }
+    value.append(payload.data(), payload.size());
+
+    auto res = datalake::get_proto_offsets(value);
+    ASSERT_TRUE(res.has_value());
+    const auto& offsets = res.value().protobuf_offsets;
+    EXPECT_EQ(offsets.size(), proto_msg_count);
+    for (int32_t o = 0; o < offsets.size(); o++) {
+        EXPECT_EQ(o, offsets[o]);
+    }
+    EXPECT_EQ(res.value().schema_id(), schema_id);
+    EXPECT_EQ(res.value().shared_message_data.size_bytes(), payload.size());
+}
+
+TEST(DatalakeSchemaRegistry, GetProtoOffsetsDefaultZero) {
+    // This tests a special case where the offset count is 0, we should assume
+    // that the message is the first one defined in the schema and return {0}.
+    iobuf value;
+
+    uint8_t magic = 0; // Invalid magic
+    uint32_t schema_id = 12;
+    int32_t schema_id_encoded = htobe32(schema_id);
+    std::string payload = "Hello world";
+    value.append(&magic, 1);
+    value.append(reinterpret_cast<uint8_t*>(&schema_id_encoded), 4);
+
+    uint8_t proto_msg_count = 0;
+    std::array<uint8_t, 16> encoded;
+    size_t encoded_size = vint::serialize(proto_msg_count, encoded.data());
+    value.append(encoded.data(), encoded_size);
+    value.append(payload.data(), payload.size());
+
+    auto res = datalake::get_proto_offsets(value);
+    ASSERT_TRUE(res.has_value());
+    const auto& offsets = res.value().protobuf_offsets;
+    EXPECT_EQ(offsets.size(), 1);
+    EXPECT_EQ(offsets[0], 0);
+    EXPECT_EQ(res.value().schema_id(), schema_id);
+    EXPECT_EQ(res.value().shared_message_data.size_bytes(), payload.size());
+}
+
+TEST(DatalakeSchemaRegistry, GetProtoOffsetsNotEnoughData) {
+    iobuf value;
+
+    uint8_t magic = 0; // Invalid magic
+    uint32_t schema_id = 12;
+    int32_t schema_id_encoded = htobe32(schema_id);
+    value.append(&magic, 1);
+    value.append(reinterpret_cast<uint8_t*>(&schema_id_encoded), 4);
+
+    uint8_t proto_msg_count = 9;
+    std::array<uint8_t, 16> encoded;
+    size_t encoded_size = vint::serialize(proto_msg_count, encoded.data());
+    value.append(encoded.data(), encoded_size);
+
+    for (uint8_t i = 0; i < proto_msg_count - 1; i++) {
+        encoded_size = vint::serialize(i, encoded.data());
+        value.append(encoded.data(), encoded_size);
+    }
+
+    auto res = datalake::get_proto_offsets(value);
+    ASSERT_TRUE(res.has_error());
+    EXPECT_EQ(res.error(), datalake::get_schema_error::not_enough_bytes);
+}