Skip to content

Commit b3b1198

Browse files
authored
Offload entry root slice to QE (#385)
For some queries, Sort, (Finalized) Aggregate and Join are executed on the QD, which increases the workload on a single point. This PR alleviates that by offloading these operators to a QE. Specifically, it checks whether the root slice (the slice of the plan before it meets the first Motion) contains these operators in should_offload_entry_to_qe_plan_walker; then it checks whether the offloading can be performed safely in safe_to_offload_entry_to_qe_rte_walker by walking the range table. This implementation deliberately does not rely on RelOptInfo or anything specific to the Postgres optimizer, so that it can be ported to Orca in the future.
1 parent 75f6d24 commit b3b1198

File tree

9 files changed

+1312
-1
lines changed

9 files changed

+1312
-1
lines changed

src/backend/optimizer/plan/createplan.c

Lines changed: 163 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
#include "optimizer/tlist.h"
4747
#include "parser/parse_clause.h"
4848
#include "parser/parsetree.h"
49+
#include "partitioning/partdesc.h"
4950
#include "partitioning/partprune.h"
5051
#include "utils/lsyscache.h"
5152
#include "utils/uri.h"
@@ -102,6 +103,11 @@ typedef struct
102103
bool result;
103104
} contain_motion_walk_context;
104105

106+
typedef struct
107+
{
108+
bool computeOnSlice; /* does root slice contain computation node (Sort, Join, Agg) */
109+
} offload_entry_to_qe_plan_walk_context;
110+
105111
static Plan *create_scan_plan(PlannerInfo *root, Path *best_path,
106112
int flags);
107113
static List *build_path_tlist(PlannerInfo *root, Path *path);
@@ -9063,4 +9069,160 @@ push_locus_down_after_elide_motion(Plan* plan)
90639069
plan = plan->lefttree;
90649070
}
90659071
}
9066-
}
9072+
}
9073+
9074+
/*
9075+
* Restore Entry locus to SingleQE in the root slice.
9076+
* This is simply a reverse of push_locus_down_after_elide_motion.
9077+
* The difference is that it's NOT used when creating a plan but rather
9078+
* after a plan gets created, it's used to modify the plan in offload_entry_to_qe.
9079+
*/
9080+
static void
9081+
replace_entry_locus_with_singleqe(Plan *plan)
9082+
{
9083+
while (plan && (plan->locustype == CdbLocusType_Entry))
9084+
{
9085+
plan->locustype = CdbLocusType_SingleQE;
9086+
switch (nodeTag(plan))
9087+
{
9088+
case T_Motion:
9089+
return;
9090+
case T_Append:
9091+
{
9092+
List *subplans = NIL;
9093+
ListCell *cell;
9094+
subplans = ((Append *) (plan))->appendplans;
9095+
foreach(cell, subplans)
9096+
{
9097+
replace_entry_locus_with_singleqe(lfirst(cell));
9098+
}
9099+
break;
9100+
}
9101+
case T_SubqueryScan:
9102+
plan = ((SubqueryScan *)(plan))->subplan;
9103+
break;
9104+
case T_NestLoop:
9105+
case T_MergeJoin:
9106+
case T_HashJoin:
9107+
replace_entry_locus_with_singleqe(plan->righttree);
9108+
/* FALLTHROUGH */
9109+
default:
9110+
plan = plan->lefttree;
9111+
break;
9112+
}
9113+
}
9114+
}
9115+
9116+
/*
9117+
* Check whether we can safely offload root slice on QD to a QE.
9118+
*/
9119+
static bool
9120+
safe_to_offload_entry_to_qe_rte_walker(List *rtes)
9121+
{
9122+
ListCell *lc;
9123+
foreach(lc, rtes)
9124+
{
9125+
RangeTblEntry *rte = lfirst_node(RangeTblEntry, lc);
9126+
if (rte->rtekind == RTE_RELATION)
9127+
{
9128+
// check if any partition of a partitioned table is a coordinator-only external/foreign table
9129+
if (rte->relkind == RELKIND_PARTITIONED_TABLE)
9130+
{
9131+
Relation rel;
9132+
PartitionDesc desc;
9133+
9134+
rel = relation_open(rte->relid, NoLock);
9135+
desc = RelationGetPartitionDesc(rel, true);
9136+
relation_close(rel, NoLock);
9137+
for (int i = 0; i < desc->nparts; i++)
9138+
{
9139+
if (GpPolicyIsEntry(GpPolicyFetch(desc->oids[i])))
9140+
return false;
9141+
}
9142+
return true;
9143+
}
9144+
else
9145+
return !GpPolicyIsEntry(GpPolicyFetch(rte->relid));
9146+
}
9147+
else if (rte->rtekind == RTE_SUBQUERY)
9148+
{
9149+
if (!safe_to_offload_entry_to_qe_rte_walker(rte->subquery->rtable))
9150+
return false;
9151+
}
9152+
}
9153+
return true;
9154+
}
9155+
9156+
/*
9157+
* Check if there are multiple Motion in which the root slice contains computation (Sort, Join or Aggregate).
9158+
*/
9159+
static bool
9160+
should_offload_entry_to_qe_plan_walker(Plan *plan, offload_entry_to_qe_plan_walk_context *ctx)
9161+
{
9162+
while (plan && plan->locustype == CdbLocusType_Entry)
9163+
{
9164+
switch (nodeTag(plan))
9165+
{
9166+
case T_Motion:
9167+
return ctx->computeOnSlice;
9168+
case T_SubqueryScan:
9169+
plan = ((SubqueryScan *) plan)->subplan;
9170+
break;
9171+
/* join */
9172+
case T_Join:
9173+
case T_MergeJoin:
9174+
case T_HashJoin:
9175+
case T_NestLoop:
9176+
ctx->computeOnSlice = true;
9177+
if (should_offload_entry_to_qe_plan_walker(plan->righttree, ctx))
9178+
return true;
9179+
plan = plan->lefttree;
9180+
break;
9181+
/* sort */
9182+
case T_Sort:
9183+
/* aggregates*/
9184+
case T_Agg:
9185+
case T_WindowAgg:
9186+
ctx->computeOnSlice = true;
9187+
/* FALLTHROUGH */
9188+
default:
9189+
plan = plan->lefttree;
9190+
break;
9191+
}
9192+
}
9193+
return false;
9194+
}
9195+
9196+
Plan *
9197+
offload_entry_to_qe(PlannerInfo *root, Plan *plan, int sendslice_parallel)
9198+
{
9199+
offload_entry_to_qe_plan_walk_context plan_walk_ctx;
9200+
plan_walk_ctx.computeOnSlice = false;
9201+
9202+
if (root->parse->commandType == CMD_SELECT &&
9203+
should_offload_entry_to_qe_plan_walker(plan, &plan_walk_ctx) &&
9204+
safe_to_offload_entry_to_qe_rte_walker(root->parse->rtable) &&
9205+
!contain_volatile_functions((Node *) root->parse))
9206+
{
9207+
CdbPathLocus entrylocus;
9208+
PlanSlice *sendSlice;
9209+
sendSlice = (PlanSlice *) palloc0(sizeof(PlanSlice));
9210+
sendSlice->gangType = GANGTYPE_SINGLETON_READER;
9211+
sendSlice->numsegments = 1;
9212+
sendSlice->sliceIndex = -1;
9213+
sendSlice->parallel_workers = sendslice_parallel;
9214+
sendSlice->segindex = gp_session_id % getgpsegmentCount();
9215+
9216+
replace_entry_locus_with_singleqe(plan);
9217+
9218+
plan = (Plan *) make_union_motion(plan);
9219+
((Motion *) plan)->senderSliceInfo = sendSlice;
9220+
9221+
plan->locustype = CdbLocusType_Entry;
9222+
CdbPathLocus_MakeEntry(&entrylocus);
9223+
if (plan->flow)
9224+
pfree(plan->flow);
9225+
plan->flow = cdbpathtoplan_create_flow(root, entrylocus);
9226+
}
9227+
return plan;
9228+
}

src/backend/optimizer/plan/planner.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -594,6 +594,12 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions,
594594
/* Decorate the top node of the plan with a Flow node. */
595595
top_plan->flow = cdbpathtoplan_create_flow(root, best_path->locus);
596596

597+
/* Modifier: If root slice is executed on QD, try to offload it to a QE */
598+
if (enable_offload_entry_to_qe && Gp_role == GP_ROLE_DISPATCH)
599+
{
600+
top_plan = offload_entry_to_qe(root, top_plan, best_path->locus.parallel_workers);
601+
}
602+
597603
/*
598604
* If creating a plan for a scrollable cursor, make sure it can run
599605
* backwards on demand. Add a Material node at the top at need.

src/backend/utils/misc/guc_gp.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,7 @@ bool gp_enable_global_deadlock_detector = false;
430430
bool gp_enable_predicate_pushdown;
431431
int gp_predicate_pushdown_sample_rows;
432432

433+
bool enable_offload_entry_to_qe = false;
433434
bool enable_answer_query_using_materialized_views = false;
434435

435436
static const struct config_enum_entry gp_log_format_options[] = {
@@ -2429,6 +2430,16 @@ struct config_bool ConfigureNamesBool_gp[] =
24292430
true,
24302431
NULL, NULL, NULL
24312432
},
2433+
{
2434+
{"enable_offload_entry_to_qe", PGC_USERSET, DEVELOPER_OPTIONS,
2435+
gettext_noop("Enable plans with operations on coordinator to be offloaded to QEs."),
2436+
NULL,
2437+
GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE | GUC_GPDB_NO_SYNC
2438+
},
2439+
&enable_offload_entry_to_qe,
2440+
false,
2441+
NULL, NULL, NULL
2442+
},
24322443
{
24332444
{"optimizer_enable_gather_on_segment_for_dml", PGC_USERSET, DEVELOPER_OPTIONS,
24342445
gettext_noop("Enable DML optimization by enforcing a non-master gather in the optimizer."),

src/include/optimizer/planmain.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ extern Result *make_result(List *tlist, Node *resconstantqual, Plan *subplan);
8080
extern Plan *add_sort_cost(PlannerInfo *root, Plan *input,
8181
double limit_tuples);
8282
extern Plan *plan_pushdown_tlist(PlannerInfo *root, Plan *plan, List *tlist); /*CDB*/
83+
extern Plan *offload_entry_to_qe(PlannerInfo *root, Plan *plan, int sendslice_parallel); /* CBDB */
8384

8485
/* External use of these functions is deprecated: */
8586
extern Sort *make_sort_from_sortclauses(List *sortcls,

src/include/utils/guc.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,7 @@ extern bool enable_parallel;
301301
extern int gp_appendonly_insert_files;
302302
extern int gp_appendonly_insert_files_tuples_range;
303303
extern bool enable_answer_query_using_materialized_views;
304+
extern bool enable_offload_entry_to_qe;
304305
/*
305306
* gp_enable_multiphase_limit is not cost based.
306307
* When set to false, the planner will not use multi-phase limit.

src/include/utils/unsync_guc_name.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@
150150
"enable_seqscan",
151151
"enable_sort",
152152
"enable_tidscan",
153+
"enable_offload_entry_to_qe",
153154
"escape_string_warning",
154155
"event_source",
155156
"exit_on_error",

0 commit comments

Comments
 (0)