From 03d967cd00386f68b8dc1792a31fb71f2ba8098c Mon Sep 17 00:00:00 2001 From: otegami Date: Thu, 6 Mar 2025 19:10:46 +0800 Subject: [PATCH 1/6] debug: support parallel build for hash key --- lib/index_column.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/index_column.c b/lib/index_column.c index a8874173ee..980cd323fe 100644 --- a/lib/index_column.c +++ b/lib/index_column.c @@ -181,6 +181,7 @@ grn_index_column_build(grn_ctx *ctx, grn_obj *index_column) NULL, NULL); switch (lexicon_flags & GRN_OBJ_TABLE_TYPE_MASK) { + case GRN_OBJ_TABLE_HASH_KEY: case GRN_OBJ_TABLE_PAT_KEY: case GRN_OBJ_TABLE_DAT_KEY: use_grn_ii_build = true; From 3ac760039ca613f39cf286da24e32ffe09df85d5 Mon Sep 17 00:00:00 2001 From: otegami Date: Thu, 6 Mar 2025 19:22:15 +0800 Subject: [PATCH 2/6] debug: check progress --- lib/ii.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/ii.cpp b/lib/ii.cpp index 035271b167..5d482c81ef 100644 --- a/lib/ii.cpp +++ b/lib/ii.cpp @@ -17298,6 +17298,7 @@ namespace grn::ii { } } value = global_tid; + printf("flush_term[%d] %u\n", n_blocks_, global_tid); p = file_buf_ + file_buf_offset_; if (value < 1U << 5) { p[0] = static_cast(value); @@ -17513,6 +17514,7 @@ namespace grn::ii { } auto n_processed_records = block_builder->n_processed_records(); if (block_builder->have_data()) { + printf("offset: %u\n", offset); auto rc = flush_block_builder(block_builder.get()); if (rc != GRN_SUCCESS) { break; @@ -18294,6 +18296,7 @@ namespace grn::ii { return rc; } blocks_[i].tid = static_cast(value); + printf("block[%d].tid = %u\n", i, blocks_[i].tid); } auto cursor = grn_table_cursor_open(ctx_, @@ -18306,14 +18309,17 @@ namespace grn::ii { -1, GRN_CURSOR_BY_KEY); for (;;) { + printf("type: %u\n", ii_->lexicon->header.type); grn_id tid = grn_table_cursor_next(ctx_, cursor); if (tid == GRN_ID_NIL) { break; } + printf("commit: tid: %u\n", tid); chunk_.tid = tid; chunk_.rid = GRN_ID_NIL; df_ = 0; for (uint32_t i = 0; i < n_blocks_; i++) { + 
printf("block[%d].tid: %u == tid: %u\n", i, blocks_[i].tid, tid); if (tid == blocks_[i].tid) { auto rc = read_to_chunk(i); if (rc != GRN_SUCCESS) { From 5fce3a3bbcd86c4d138cd458cb702f8c6f78345c Mon Sep 17 00:00:00 2001 From: otegami Date: Thu, 6 Mar 2025 19:22:53 +0800 Subject: [PATCH 3/6] test: dummy parallel test for hash key --- .../source/offline/parallel_for_hash_key.test | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 test/command/suite/column_create/vector/source/offline/parallel_for_hash_key.test diff --git a/test/command/suite/column_create/vector/source/offline/parallel_for_hash_key.test b/test/command/suite/column_create/vector/source/offline/parallel_for_hash_key.test new file mode 100644 index 0000000000..91db23a6b7 --- /dev/null +++ b/test/command/suite/column_create/vector/source/offline/parallel_for_hash_key.test @@ -0,0 +1,17 @@ +table_create Data TABLE_NO_KEY +column_create Data value1 COLUMN_SCALAR Text + +#@timeout 120 +#@disable-logging +#@generate-series 1 10241 Data '{"value1" => i.to_s.chars.join(" ")}' +#@enable-logging +#@timeout default + +table_create Terms TABLE_HASH_KEY ShortText \ + --default_tokenizer TokenNgram \ + --normalizers NormalizerNFKC +column_create Terms data_values COLUMN_INDEX|WITH_POSITION \ + Data value1 --n_workers -1 + +#@timeout 240 +index_column_diff Terms data_values From 41477a2c3ee68ae4833736c3ae078fb007eb2a6a Mon Sep 17 00:00:00 2001 From: otegami Date: Fri, 7 Mar 2025 14:46:07 +0800 Subject: [PATCH 4/6] define grn_hash_cursor_open_by_key to sort grn_hash by key --- lib/hash.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/lib/hash.c b/lib/hash.c index 4c00a910f9..8db0566523 100644 --- a/lib/hash.c +++ b/lib/hash.c @@ -4328,6 +4328,14 @@ grn_hash_delete(grn_ctx *ctx, grn_hash *hash, const void *key, uint32_t key_size } } +static grn_hash_cursor * +grn_hash_cursor_open_by_key(grn_ctx *ctx, grn_hash *hash, + const void *min, uint32_t min_size, + const void *max, 
uint32_t max_size, + int offset, int limit, int flags) +{ +} + void grn_hash_cursor_close(grn_ctx *ctx, grn_hash_cursor *c) { @@ -4350,6 +4358,17 @@ grn_hash_cursor_open(grn_ctx *ctx, grn_hash *hash, return NULL; } if (!(c = GRN_CALLOC(sizeof(grn_hash_cursor)))) { return NULL; } + if ((flags & GRN_CURSOR_BY_KEY)) { + return grn_hash_cursor_open_by_key(ctx, + hash, + min, + min_size, + max, + max_size, + offset, + limit, + flags); + } GRN_DB_OBJ_SET_TYPE(c, GRN_CURSOR_TABLE_HASH_KEY); c->hash = hash; c->ctx = ctx; From 2e07f8e11886e320150499ee5fc3e9ee3071d155 Mon Sep 17 00:00:00 2001 From: otegami Date: Fri, 7 Mar 2025 15:31:36 +0800 Subject: [PATCH 5/6] define struct to put sorted hash entries --- lib/grn_hash.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lib/grn_hash.h b/lib/grn_hash.h index a822d3ee89..e68f64c1d3 100644 --- a/lib/grn_hash.h +++ b/lib/grn_hash.h @@ -299,6 +299,14 @@ struct _grn_hash_header_large { grn_id garbages[GRN_HASH_MAX_KEY_SIZE_LARGE]; }; +struct _grn_hash_cursor_sorted_entires { + grn_id *sorted_entries; + uint32_t n_entries; + uint32_t curr_idx; +}; + +typedef struct _grn_hash_cursor_sorted_entires grn_hash_cursor_sorted_entires; + struct _grn_hash_cursor { grn_db_obj obj; grn_hash *hash; @@ -307,6 +315,7 @@ struct _grn_hash_cursor { grn_id tail; unsigned int rest; int dir; + grn_hash_cursor_sorted_entires *sorted_entries; }; /* deprecated */ From 9f677edf2f727ac4cca9297e90020e593e5d15ff Mon Sep 17 00:00:00 2001 From: otegami Date: Fri, 7 Mar 2025 16:59:15 +0800 Subject: [PATCH 6/6] [no ci] debug: wip --- lib/grn_hash.h | 10 +++---- lib/hash.c | 73 +++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 71 insertions(+), 12 deletions(-) diff --git a/lib/grn_hash.h b/lib/grn_hash.h index e68f64c1d3..be5f6a078c 100644 --- a/lib/grn_hash.h +++ b/lib/grn_hash.h @@ -299,13 +299,13 @@ struct _grn_hash_header_large { grn_id garbages[GRN_HASH_MAX_KEY_SIZE_LARGE]; }; -struct _grn_hash_cursor_sorted_entires { - grn_id 
*sorted_entries; +struct _grn_hash_cursor_sorted_entries { + grn_array *sorted_entries; uint32_t n_entries; - uint32_t curr_idx; + uint32_t curr; }; -typedef struct _grn_hash_cursor_sorted_entires grn_hash_cursor_sorted_entires; +typedef struct _grn_hash_cursor_sorted_entries grn_hash_cursor_sorted_entries; struct _grn_hash_cursor { grn_db_obj obj; @@ -315,7 +315,7 @@ struct _grn_hash_cursor { grn_id tail; unsigned int rest; int dir; - grn_hash_cursor_sorted_entires *sorted_entries; + grn_hash_cursor_sorted_entries *sorted_entries; }; /* deprecated */ diff --git a/lib/hash.c b/lib/hash.c index 8db0566523..c19ab0067e 100644 --- a/lib/hash.c +++ b/lib/hash.c @@ -4334,12 +4334,60 @@ grn_hash_cursor_open_by_key(grn_ctx *ctx, grn_hash *hash, const void *max, uint32_t max_size, int offset, int limit, int flags) { + int dir; + grn_hash_cursor *c; + if (!ctx || !hash) { + return NULL; + } + if (!(c = GRN_CALLOC(sizeof(grn_hash_cursor)))) { + return NULL; + } + GRN_DB_OBJ_SET_TYPE(c, GRN_CURSOR_TABLE_HASH_KEY); + c->ctx = ctx; + c->hash = hash; + c->obj.header.flags = (grn_obj_flags){flags}; + c->obj.header.domain = GRN_ID_NIL; + c->dir = dir = 1; // TODO: support for descending order. + + // TODO: Is GRN_ARRAY_TINY really enough? + grn_array *sorted = grn_array_create(ctx, NULL, sizeof(grn_id), GRN_ARRAY_TINY); + if (!sorted) { + GRN_LOG(ctx, + GRN_LOG_ALERT, + "grn_hash_sort on grn_hash_cursor_open_by_key failed !"); + grn_hash_close(ctx, hash); + return NULL; + } + grn_table_sort_optarg sort_opt = {0}; + int n_sorted = grn_hash_sort(ctx, c->hash, limit, sorted, &sort_opt); + if (n_sorted == 0) { + grn_array_close(ctx, sorted); + return NULL; + } + + size_t start_idx = (offset > 0) ? 
offset : 0; + grn_hash_cursor_sorted_entries *sorted_entries = + GRN_MALLOC(sizeof(grn_hash_cursor_sorted_entries)); + if (!sorted_entries) { + grn_array_close(ctx, sorted); + return NULL; + } + sorted_entries->sorted_entries = sorted; + sorted_entries->n_entries = grn_array_size(ctx, sorted); + sorted_entries->curr = start_idx; + c->sorted_entries = sorted_entries; + c->rest = (limit < 0) ? GRN_ARRAY_MAX : (unsigned int)limit; + + return c; } void grn_hash_cursor_close(grn_ctx *ctx, grn_hash_cursor *c) { GRN_ASSERT(c->ctx == ctx); + if (c->sorted_entries) { + GRN_FREE(c->sorted_entries); + } GRN_FREE(c); } @@ -4358,7 +4406,7 @@ grn_hash_cursor_open(grn_ctx *ctx, grn_hash *hash, return NULL; } if (!(c = GRN_CALLOC(sizeof(grn_hash_cursor)))) { return NULL; } - if ((flags & GRN_CURSOR_BY_KEY)) { + if (!(flags & GRN_CURSOR_BY_ID)) { return grn_hash_cursor_open_by_key(ctx, hash, min, @@ -4434,13 +4482,24 @@ grn_id grn_hash_cursor_next(grn_ctx *ctx, grn_hash_cursor *c) { if (c && c->rest) { - while (c->curr_rec != c->tail) { - c->curr_rec = (grn_id)((int64_t)(c->curr_rec) + c->dir); - if (*c->hash->n_entries != HASH_CURR_MAX(c->hash)) { - if (!grn_hash_bitmap_at(ctx, c->hash, c->curr_rec)) { continue; } - } + if (c->sorted_entries) { + grn_hash_cursor_sorted_entries *se = c->sorted_entries; + if (se->curr >= se->n_entries) + return GRN_ID_NIL; + grn_array *array = se->sorted_entries; + grn_id id = grn_array_at(ctx, array, se->curr); + se->curr++; c->rest--; - return c->curr_rec; + return id; + } else { + while (c->curr_rec != c->tail) { + c->curr_rec = (grn_id)((int64_t)(c->curr_rec) + c->dir); + if (*c->hash->n_entries != HASH_CURR_MAX(c->hash)) { + if (!grn_hash_bitmap_at(ctx, c->hash, c->curr_rec)) { continue; } + } + c->rest--; + return c->curr_rec; + } } } return GRN_ID_NIL;