Skip to content

Commit 497b043

Browse files
committed
Apache DataSketches plugin for Trino
Plugin to query Apache DataSketches (https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html). This includes merge and estimate function for theta sketch.
1 parent e35c6f5 commit 497b043

File tree

20 files changed

+1225
-0
lines changed

20 files changed

+1225
-0
lines changed

core/trino-server/src/main/provisio/trino.xml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,12 @@
4040
</artifact>
4141
</artifactSet>
4242

43+
<artifactSet to="plugin/datasketches">
44+
<artifact id="${project.groupId}:trino-datasketches:zip:${project.version}">
45+
<unpack />
46+
</artifact>
47+
</artifactSet>
48+
4349
<artifactSet to="plugin/delta-lake">
4450
<artifact id="${project.groupId}:trino-delta-lake:zip:${project.version}">
4551
<unpack />

docs/src/main/sphinx/functions.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ Color <functions/color>
4040
Comparison <functions/comparison>
4141
Conditional <functions/conditional>
4242
Conversion <functions/conversion>
43+
Datasketches <functions/datasketches>
4344
Date and time <functions/datetime>
4445
Decimal <functions/decimal>
4546
Geospatial <functions/geospatial>
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# DataSketches functions
2+
3+
DataSketches is a high-performance library of stochastic streaming
4+
algorithms, commonly called sketches. Sketches are small, stateful programs
5+
that process massive data as a stream and can provide approximate answers,
6+
with mathematical guarantees, much faster than traditional exact methods.
7+
The DataSketches functions allow querying the fast and memory-efficient
8+
[Apache DataSketches](https://datasketches.apache.org/docs/Community/Research.html)
9+
from Trino. Support for the
10+
[Theta Sketch framework](https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html)
11+
is available through {func}`theta_sketch_union` and
12+
{func}`theta_sketch_estimate`, typically used in `COUNT DISTINCT` queries.
13+
DataSketches can be created with Hive or Pig using their respective sketch
14+
APIs.
15+
16+
To use these functions, configure a catalog with the
17+
``datasketches`` connector (for example, name the catalog ``datasketches``)
18+
and either qualify function calls with ``datasketches.theta.`` or add
19+
``datasketches.theta`` to the session ``path``.
20+
21+
Note: Trino does not create new sketches. Build the Theta sketches in your
22+
ingest pipeline or another engine (e.g., Spark or Hive) and store the
23+
serialized sketch bytes as a ``VARBINARY`` column. Trino functions operate on those serialized sketches.
24+
25+
## Functions
26+
27+
:::{function} theta_sketch_union(sketches [, nominal_entries, seed]) -> varbinary
28+
Returns a serialized sketch (VARBINARY) which is a merged collection of sketches. The optional
29+
`nominal_entries` and `seed` parameters let you specify non-default sketch size and seed when
30+
merging sketches created with custom settings.
31+
:::
32+
33+
:::{function} theta_sketch_estimate(sketch) -> double
34+
Returns the estimated value of the sketch.
35+
:::
36+
37+
::: {function} theta_sketch_estimate(sketch, seed) -> double
38+
:noindex: true
39+
40+
Returns the estimated value of the sketch using the supplied `seed`. Use
41+
this when the sketch was created with a non-default seed.
42+
:::
43+
44+
## Examples
45+
46+
```sql
47+
SELECT
48+
o_orderdate AS date,
49+
theta_sketch_estimate(theta_sketch_union(o_custkey_sketch)) AS unique_user_count,
50+
SUM(o_totalprice) AS user_spent
51+
FROM tpch.sf100000.orders
52+
WHERE o_orderdate >= dateadd(day, -90, current_date)
53+
GROUP BY o_orderdate;
54+
```

docs/src/main/sphinx/functions/list-by-topic.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,13 @@ For more details, see {doc}`conversion`
176176
- {func}`try_cast`
177177
- {func}`typeof`
178178

179+
## DataSketches
180+
181+
For more details, see {doc}`datasketches`
182+
183+
- {func}`theta_sketch_estimate`
184+
- {func}`theta_sketch_union`
185+
179186
## Date and time
180187

181188
For more details, see {doc}`datetime`

docs/src/main/sphinx/functions/list.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,8 @@
464464
- {func}`tan`
465465
- {func}`tanh`
466466
- {func}`tdigest_agg`
467+
- {func}`theta_sketch_estimate`
468+
- {func}`theta_sketch_union`
467469
- {func}`timestamp_objectid`
468470
- {func}`timezone`
469471
- {func}`timezone_hour`

plugin/trino-datasketches/pom.xml

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3+
<modelVersion>4.0.0</modelVersion>
4+
5+
<parent>
6+
<groupId>io.trino</groupId>
7+
<artifactId>trino-root</artifactId>
8+
<version>479-SNAPSHOT</version>
9+
<relativePath>../../pom.xml</relativePath>
10+
</parent>
11+
12+
<artifactId>trino-datasketches</artifactId>
13+
<packaging>trino-plugin</packaging>
14+
<name>${project.artifactId}</name>
15+
<description>Trino - DataSketches functions</description>
16+
17+
<properties>
18+
<air.main.basedir>${project.parent.basedir}</air.main.basedir>
19+
</properties>
20+
21+
<dependencies>
22+
23+
<dependency>
24+
<groupId>com.google.guava</groupId>
25+
<artifactId>guava</artifactId>
26+
</dependency>
27+
<dependency>
28+
<groupId>io.trino</groupId>
29+
<artifactId>trino-array</artifactId>
30+
</dependency>
31+
32+
<dependency>
33+
<groupId>io.trino</groupId>
34+
<artifactId>trino-main</artifactId>
35+
<exclusions>
36+
<exclusion>
37+
<groupId>*</groupId>
38+
<artifactId>*</artifactId>
39+
</exclusion>
40+
</exclusions>
41+
</dependency>
42+
43+
<dependency>
44+
<groupId>org.apache.datasketches</groupId>
45+
<artifactId>datasketches-java</artifactId>
46+
<version>9.0.0</version>
47+
</dependency>
48+
49+
<dependency>
50+
<groupId>io.airlift</groupId>
51+
<artifactId>slice</artifactId>
52+
<scope>provided</scope>
53+
</dependency>
54+
55+
<!-- Trino SPI -->
56+
<dependency>
57+
<groupId>io.trino</groupId>
58+
<artifactId>trino-spi</artifactId>
59+
<scope>provided</scope>
60+
</dependency>
61+
62+
<dependency>
63+
<groupId>org.openjdk.jol</groupId>
64+
<artifactId>jol-core</artifactId>
65+
<scope>provided</scope>
66+
</dependency>
67+
68+
<dependency>
69+
<groupId>io.airlift</groupId>
70+
<artifactId>junit-extensions</artifactId>
71+
<scope>test</scope>
72+
</dependency>
73+
74+
<dependency>
75+
<groupId>io.trino</groupId>
76+
<artifactId>trino-testing</artifactId>
77+
<scope>test</scope>
78+
</dependency>
79+
80+
<dependency>
81+
<groupId>io.trino</groupId>
82+
<artifactId>trino-tpch</artifactId>
83+
<scope>test</scope>
84+
</dependency>
85+
86+
<dependency>
87+
<groupId>org.assertj</groupId>
88+
<artifactId>assertj-core</artifactId>
89+
<scope>test</scope>
90+
</dependency>
91+
92+
<dependency>
93+
<groupId>org.junit.jupiter</groupId>
94+
<artifactId>junit-jupiter-api</artifactId>
95+
<scope>test</scope>
96+
</dependency>
97+
98+
<dependency>
99+
<groupId>org.junit.jupiter</groupId>
100+
<artifactId>junit-jupiter-engine</artifactId>
101+
<scope>test</scope>
102+
</dependency>
103+
</dependencies>
104+
</project>
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/*
2+
* Licensed under the Apache License, Version 2.0 (the "License");
3+
* you may not use this file except in compliance with the License.
4+
* You may obtain a copy of the License at
5+
*
6+
* http://www.apache.org/licenses/LICENSE-2.0
7+
*
8+
* Unless required by applicable law or agreed to in writing, software
9+
* distributed under the License is distributed on an "AS IS" BASIS,
10+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
* See the License for the specific language governing permissions and
12+
* limitations under the License.
13+
*/
14+
package io.trino.plugin.datasketches.state;
15+
16+
import io.airlift.slice.Slice;
17+
import io.trino.spi.function.AccumulatorState;
18+
import io.trino.spi.function.AccumulatorStateMetadata;
19+
20+
/**
21+
* State object to keep track of sketch aggregations.
22+
*/
23+
@AccumulatorStateMetadata(stateSerializerClass = SketchStateSerializer.class, stateFactoryClass = SketchStateFactory.class)
24+
public interface SketchState
25+
extends AccumulatorState
26+
{
27+
Slice getSketch();
28+
29+
int getNominalEntries();
30+
31+
long getSeed();
32+
33+
void setSketch(Slice value);
34+
35+
void setNominalEntries(int value);
36+
37+
void setSeed(long value);
38+
39+
void merge(SketchState state);
40+
}

0 commit comments

Comments
 (0)