Implement column type deduction and ColumnTypedData

Desbordante · Mar 25, 2022 · 90570f1 · 90570f1
1 parent 8fa4650
commit 90570f1
Show file tree

Hide file tree

Showing 4 changed files with 442 additions and 0 deletions.
diff --git a/src/model/ColumnLayoutTypedRelationData.cpp b/src/model/ColumnLayoutTypedRelationData.cpp
@@ -0,0 +1,64 @@
+#include "ColumnLayoutTypedRelationData.h"
+
+#include <easylogging++.h>
+
+namespace model {
+
+/* Reads rows from `file_input`, regroups the fields by column and builds a typed column
+ * for each of the kept columns.
+ *
+ * file_input      -- source of the relation name, the column names and the rows;
+ * is_null_eq_null -- forwarded to the schema and to the typed column factory;
+ * max_cols        -- if positive, only the first `max_cols` columns are kept;
+ * max_rows        -- if positive, at most `max_rows` well-formed rows are kept.
+ *
+ * Rows whose number of fields differs from the number of columns of the input are skipped
+ * with a warning. An empty row of a single-column input is treated as one empty field. */
+std::unique_ptr<ColumnLayoutTypedRelationData> ColumnLayoutTypedRelationData::CreateFrom(
+    CSVParser& file_input, bool is_null_eq_null, int max_cols, long max_rows) {
+    auto schema = std::make_unique<RelationalSchema>(file_input.GetRelationName(), is_null_eq_null);
+
+    /* Row width is validated against the width of the input, not against the number of kept
+     * columns: otherwise every row would be rejected as soon as `max_cols` truncates the
+     * relation. Assumes ParseNext() yields GetNumberOfColumns() fields for a well-formed row. */
+    int const file_num_columns = file_input.GetNumberOfColumns();
+    int num_columns = file_num_columns;
+
+    if (max_cols > 0) {
+        num_columns = std::min(num_columns, max_cols);
+    }
+
+    std::vector<std::vector<std::string>> columns(num_columns);
+    /* Same type as `max_rows`, so the comparison below cannot be affected by narrowing. */
+    long row_num = 0;
+    std::vector<std::string> row;
+
+    /* Parsing is very similar to ColumnLayoutRelationData::CreateFrom().
+     * Maybe we need column-based parsing in addition to row-based in CSVParser */
+    while (file_input.GetHasNext()) {
+        row = file_input.ParseNext();
+
+        if (row.empty() && file_num_columns == 1) {
+            row.emplace_back("");
+        } else if (static_cast<int>(row.size()) != file_num_columns) {
+            LOG(WARNING) << "Skipping incomplete rows";
+            continue;
+        }
+
+        if (max_rows > 0 && row_num >= max_rows) {
+            /* The number of remaining rows is unknown without reading them, so it is not
+             * reported. */
+            LOG(WARNING) << "Processed " << row_num
+                         << " rows; the remaining rows are left unprocessed due to `max_rows`"
+                            " parameter.";
+            break;
+        }
+
+        /* Keep only the first `num_columns` fields; the row is known to be at least that wide. */
+        for (int index = 0; index < num_columns; ++index) {
+            columns[index].push_back(std::move(row[index]));
+        }
+        row_num++;
+    }
+
+    std::vector<TypedColumnData> column_data;
+    column_data.reserve(num_columns);
+    for (int i = 0; i < num_columns; ++i) {
+        Column column(schema.get(), file_input.GetColumnName(i), i);
+        schema->AppendColumn(std::move(column));
+        TypedColumnData typed_column_data = model::TypedColumnDataFactory::CreateFrom(
+            schema->GetColumn(i), std::move(columns[i]), is_null_eq_null);
+        column_data.emplace_back(std::move(typed_column_data));
+    }
+
+    schema->Init();
+
+    return std::make_unique<ColumnLayoutTypedRelationData>(std::move(schema),
+                                                           std::move(column_data));
+}
+
+}  // namespace model
diff --git a/src/model/ColumnLayoutTypedRelationData.h b/src/model/ColumnLayoutTypedRelationData.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include "CSVParser.h"
+#include "RelationData.h"
+#include "TypedColumnData.h"
+
+namespace model {
+
+/* Relation data whose columns are represented by TypedColumnData. */
+using TypedRelationData = AbstractRelationData<TypedColumnData>;
+
+/* Relation stored column by column as typed data; instances are built from CSV input
+ * with CreateFrom(). */
+class ColumnLayoutTypedRelationData final : public TypedRelationData {
+public:
+    /* Inherit the constructors of the base relation. */
+    using TypedRelationData::AbstractRelationData;
+
+    /* Parses `file_input` into typed columns. By default (`max_cols`, `max_rows` equal to -1)
+     * neither the number of columns nor the number of rows is limited. */
+    static std::unique_ptr<ColumnLayoutTypedRelationData> CreateFrom(CSVParser& file_input,
+                                                                     bool is_null_eq_null,
+                                                                     int max_cols = -1,
+                                                                     long max_rows = -1);
+
+    /* Number of rows of the first column, or 0 if the relation has no columns. */
+    unsigned int GetNumRows() const final {
+        return column_data_.empty() ? 0 : column_data_.front().GetNumRows();
+    }
+};
+
+}  // namespace model
diff --git a/src/model/TypedColumnData.cpp b/src/model/TypedColumnData.cpp
@@ -0,0 +1,172 @@
+#include "TypedColumnData.h"
+
+#include "CreateType.h"
+
+namespace model {
+
+/* Groups the row indices of the unparsed column by the type deduced for each value.
+ * A value gets the type of the first pattern of `type_id_to_regex_` that matches it entirely
+ * and is treated as a string if no pattern matches. If the column contains both ints and
+ * big ints, the int rows are moved to the big int group. */
+TypedColumnDataFactory::TypeMap TypedColumnDataFactory::CreateTypeMap() const {
+    TypeMap rows_by_type;
+
+    for (size_t row = 0; row != unparsed_.size(); ++row) {
+        std::string const& value = unparsed_[row];
+
+        /* Fallback used when none of the patterns matches. */
+        TypeId deduced = TypeId::kString;
+        for (auto const& [candidate_id, pattern] : type_id_to_regex_) {
+            if (std::regex_match(value, pattern)) {
+                deduced = candidate_id;
+                break;
+            }
+        }
+        rows_by_type[deduced].insert(row);
+    }
+
+    /* The int group is dissolved only when a big int group exists to receive its rows. */
+    auto const big_int_it = rows_by_type.find(TypeId::kBigInt);
+    if (big_int_it != rows_by_type.end()) {
+        auto int_node = rows_by_type.extract(TypeId::kInt);
+        if (!int_node.empty()) {
+            std::unordered_set<unsigned> const& int_rows = int_node.mapped();
+            big_int_it->second.insert(int_rows.begin(), int_rows.end());
+        }
+    }
+
+    return rows_by_type;
+}
+
+/* Turns the "type -> rows" map into a per-row list of types. Rows that are not mentioned
+ * in `tm` keep the string type. */
+std::vector<TypeId> TypedColumnDataFactory::GetTypesLayout(TypeMap const& tm) const {
+    std::vector<TypeId> layout(unparsed_.size(), TypeId::kString);
+
+    for (auto const& entry : tm) {
+        TypeId const row_type = entry.first;
+        for (unsigned const row : entry.second) {
+            layout[row] = row_type;
+        }
+    }
+
+    return layout;
+}
+
+/* Creates one Type object (via CreateType) for every type id that is a key of `tm`. */
+TypedColumnDataFactory::TypeIdToType TypedColumnDataFactory::MapTypeIdsToTypes(
+    TypeMap const& tm) const {
+    TypeIdToType types;
+    for (auto const& entry : tm) {
+        TypeId const id = entry.first;
+        types.emplace(id, CreateType(id, is_null_equal_null_));
+    }
+    return types;
+}
+
+/* Sums, over all groups of `type_map`, the mixed value size of the group's type multiplied
+ * by the number of rows in the group. Every key of `type_map` is looked up in
+ * `type_id_to_type` with at(). */
+size_t TypedColumnDataFactory::CalculateMixedBufSize(MixedType const* mixed,
+                                                     TypeIdToType const& type_id_to_type,
+                                                     TypeMap const& type_map) const noexcept {
+    size_t total_bytes = 0;
+    for (auto const& entry : type_map) {
+        Type const* concrete = type_id_to_type.at(entry.first).get();
+        size_t const rows_of_type = entry.second.size();
+        total_bytes += mixed->GetMixedValueSize(concrete) * rows_of_type;
+    }
+    return total_bytes;
+}
+
+/* Builds a column of mixed type: the values of all rows are written one after another into
+ * a single buffer, each one parsed according to the type deduced for its row, and a pointer
+ * to every value is stored in row order. `type` must be the mixed type. The resulting column
+ * receives the null and empty counts, but empty sets of null and empty row indices. */
+TypedColumnData TypedColumnDataFactory::CreateMixedFromTypeMap(std::unique_ptr<Type const> type,
+                                                               TypeMap type_map) {
+    assert(type->GetTypeId() == +TypeId::kMixed);
+    auto const* mixed = static_cast<MixedType const*>(type.get());
+
+    std::vector<std::byte const*> value_ptrs;
+    value_ptrs.reserve(unparsed_.size());
+
+    /* operator[] creates empty groups for nulls and empties when the column has none. */
+    unsigned int const null_count = type_map[TypeId::kNull].size();
+    unsigned int const empty_count = type_map[TypeId::kEmpty].size();
+    unsigned int const total_rows = unparsed_.size();
+
+    TypeIdToType concrete_types = MapTypeIdsToTypes(type_map);
+    size_t const storage_size = CalculateMixedBufSize(mixed, concrete_types, type_map);
+    std::unique_ptr<std::byte[]> storage(new std::byte[storage_size]);
+    std::vector<TypeId> row_types = GetTypesLayout(type_map);
+    /* Everything needed has been derived from type_map; release its memory early. */
+    type_map.clear();
+
+    std::byte* cursor = storage.get();
+    for (size_t row = 0; row != row_types.size(); ++row) {
+        Type const* concrete = concrete_types.at(row_types[row]).get();
+        size_t const value_size = mixed->GetMixedValueSize(concrete);
+
+        assert(cursor + value_size <= storage.get() + storage_size);
+
+        mixed->ValueFromStr(cursor, std::move(unparsed_[row]), concrete);
+
+        value_ptrs.push_back(cursor);
+        cursor += value_size;
+    }
+
+    return TypedColumnData(column_, std::move(type), total_rows, null_count, empty_count,
+                           std::move(storage), std::move(value_ptrs), {}, {});
+}
+
+/* Builds a column in which every non-null, non-empty value has the same concrete type.
+ *
+ * type     -- type of the column, must not be the mixed type (use CreateMixedFromTypeMap);
+ * type_map -- row indices grouped by type id; the groups of nulls and empties are moved
+ *             into the resulting column.
+ *
+ * Values are parsed into one buffer obtained from type->Allocate(). The per-row pointer of
+ * a null or empty row stays nullptr. For the undefined type no buffer is created at all. */
+TypedColumnData TypedColumnDataFactory::CreateConcreteFromTypeMap(std::unique_ptr<Type const> type,
+                                                                  TypeMap type_map) {
+    TypeId const type_id = type->GetTypeId();
+
+    /* For mixed type use CreateMixedFromTypeMap. */
+    assert(!(type_id == +TypeId::kMixed));
+
+    std::unordered_set<unsigned> nulls = std::move(type_map[TypeId::kNull]);
+    std::unordered_set<unsigned> empties = std::move(type_map[TypeId::kEmpty]);
+    unsigned int const rows_num = unparsed_.size();
+    unsigned int const nulls_num = nulls.size();
+    unsigned int const empties_num = empties.size();
+    assert(rows_num >= nulls_num + empties_num);
+
+    std::vector<std::byte const*> data(unparsed_.size());
+
+    if (type_id == +TypeId::kUndefined) {
+        return TypedColumnData(column_, std::move(type), rows_num, nulls_num, empties_num, nullptr,
+                               std::move(data), std::move(nulls), std::move(empties));
+    }
+
+    std::unique_ptr<std::byte[]> buf(type->Allocate(rows_num - nulls_num - empties_num));
+
+    /* Looked up once instead of on every iteration. */
+    std::unordered_set<unsigned> const& value_rows = type_map.at(type_id);
+    size_t const value_size = type->GetSize();
+    /* Byte offset of the next value. It is size_t like value_size, so that it cannot wrap
+     * around where a 32-bit counter would. */
+    size_t buf_offset = 0;
+    for (unsigned const row : value_rows) {
+        /* The value about to be written must fit entirely into the buffer. */
+        assert(buf_offset + value_size <= value_size * value_rows.size());
+        std::byte* next = buf.get() + buf_offset;
+        type->ValueFromStr(next, std::move(unparsed_[row]));
+        data[row] = next;
+        buf_offset += value_size;
+    }
+
+    return TypedColumnData(column_, std::move(type), rows_num, nulls_num, empties_num,
+                           std::move(buf), std::move(data), std::move(nulls), std::move(empties));
+}
+
+/* Dispatches to the mixed or to the concrete builder depending on the id of `type`. */
+TypedColumnData TypedColumnDataFactory::CreateFromTypeMap(std::unique_ptr<Type const> type,
+                                                          TypeMap type_map) {
+    bool const is_mixed = type->GetTypeId() == +TypeId::kMixed;
+    if (!is_mixed) {
+        return CreateConcreteFromTypeMap(std::move(type), std::move(type_map));
+    }
+    return CreateMixedFromTypeMap(std::move(type), std::move(type_map));
+}
+
+/* Deduces the type of the column and builds the typed column.
+ * Nulls and empties are ignored during deduction: with no other groups the type is
+ * undefined, with exactly one group it is the type of that group, otherwise it is mixed. */
+TypedColumnData TypedColumnDataFactory::CreateFrom() {
+    TypeMap type_map = CreateTypeMap();
+
+    /* Temporarily take the null and empty groups out so that they do not affect the count. */
+    auto null_node = type_map.extract(TypeId::kNull);
+    auto empty_node = type_map.extract(TypeId::kEmpty);
+
+    TypeId deduced = TypeId::kUndefined;
+    if (type_map.size() == 1) {
+        deduced = type_map.begin()->first;
+    } else if (!type_map.empty()) {
+        deduced = TypeId::kMixed;
+    }
+
+    /* Put the groups back; inserting an empty node handle has no effect. */
+    type_map.insert(std::move(null_node));
+    type_map.insert(std::move(empty_node));
+
+    return CreateFromTypeMap(CreateType(deduced, is_null_equal_null_), std::move(type_map));
+}
+
+}  // namespace model