From efd011549a423e731229f1a9eeb5d864999954f9 Mon Sep 17 00:00:00 2001 From: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Date: Thu, 12 Sep 2024 14:41:20 -0400 Subject: [PATCH 1/4] Added introduced version labels to field types (#8227) Signed-off-by: Fanit Kolchina --- _field-types/supported-field-types/alias.md | 2 ++ _field-types/supported-field-types/binary.md | 2 ++ _field-types/supported-field-types/boolean.md | 2 ++ _field-types/supported-field-types/completion.md | 2 ++ _field-types/supported-field-types/constant-keyword.md | 2 ++ _field-types/supported-field-types/date-nanos.md | 2 ++ _field-types/supported-field-types/date.md | 2 ++ _field-types/supported-field-types/flat-object.md | 2 ++ _field-types/supported-field-types/geo-point.md | 2 ++ _field-types/supported-field-types/geo-shape.md | 2 ++ _field-types/supported-field-types/ip.md | 2 ++ _field-types/supported-field-types/join.md | 2 ++ _field-types/supported-field-types/keyword.md | 2 ++ _field-types/supported-field-types/knn-vector.md | 2 ++ _field-types/supported-field-types/match-only-text.md | 2 ++ _field-types/supported-field-types/nested.md | 2 ++ _field-types/supported-field-types/object.md | 2 ++ _field-types/supported-field-types/percolator.md | 2 ++ _field-types/supported-field-types/range.md | 2 ++ _field-types/supported-field-types/rank.md | 2 ++ _field-types/supported-field-types/search-as-you-type.md | 2 ++ _field-types/supported-field-types/text.md | 2 ++ _field-types/supported-field-types/token-count.md | 2 ++ _field-types/supported-field-types/unsigned-long.md | 2 ++ _field-types/supported-field-types/wildcard.md | 2 ++ _field-types/supported-field-types/xy-point.md | 2 ++ _field-types/supported-field-types/xy-shape.md | 2 ++ 27 files changed, 54 insertions(+) diff --git a/_field-types/supported-field-types/alias.md b/_field-types/supported-field-types/alias.md index 29cc58885c..f1f6ae9ac8 100644 --- a/_field-types/supported-field-types/alias.md +++ b/_field-types/supported-field-types/alias.md @@ -10,6 +10,8 @@ redirect_from: --- # Alias field type +**Introduced 1.0** +{: .label .label-purple } An alias field type creates another name for an existing field. You can use aliases in the[search](#using-aliases-in-search-api-operations) and [field capabilities](#using-aliases-in-field-capabilities-api-operations) API operations, with some [exceptions](#exceptions). To set up an [alias](#alias-field), you need to specify the [original field](#original-field) name in the `path` parameter. diff --git a/_field-types/supported-field-types/binary.md b/_field-types/supported-field-types/binary.md index 99d468c1dc..bb257bf7ec 100644 --- a/_field-types/supported-field-types/binary.md +++ b/_field-types/supported-field-types/binary.md @@ -10,6 +10,8 @@ redirect_from: --- # Binary field type +**Introduced 1.0** +{: .label .label-purple } A binary field type contains a binary value in [Base64](https://en.wikipedia.org/wiki/Base64) encoding that is not searchable. diff --git a/_field-types/supported-field-types/boolean.md b/_field-types/supported-field-types/boolean.md index 8233a45ad5..82cfdecf47 100644 --- a/_field-types/supported-field-types/boolean.md +++ b/_field-types/supported-field-types/boolean.md @@ -10,6 +10,8 @@ redirect_from: --- # Boolean field type +**Introduced 1.0** +{: .label .label-purple } A Boolean field type takes `true` or `false` values, or `"true"` or `"false"` strings. You can also pass an empty string (`""`) in place of a `false` value. diff --git a/_field-types/supported-field-types/completion.md b/_field-types/supported-field-types/completion.md index 85c803baa1..e6e392fb6d 100644 --- a/_field-types/supported-field-types/completion.md +++ b/_field-types/supported-field-types/completion.md @@ -11,6 +11,8 @@ redirect_from: --- # Completion field type +**Introduced 1.0** +{: .label .label-purple } A completion field type provides autocomplete functionality through a completion suggester. The completion suggester is a prefix suggester, so it matches the beginning of text only. A completion suggester creates an in-memory data structure, which provides faster lookups but leads to increased memory usage. You need to upload a list of all possible completions into the index before using this feature. diff --git a/_field-types/supported-field-types/constant-keyword.md b/_field-types/supported-field-types/constant-keyword.md index bf1e4afc70..4f9261f1a1 100644 --- a/_field-types/supported-field-types/constant-keyword.md +++ b/_field-types/supported-field-types/constant-keyword.md @@ -8,6 +8,8 @@ grand_parent: Supported field types --- # Constant keyword field type +**Introduced 2.14** +{: .label .label-purple } A constant keyword field uses the same value for all documents in the index. diff --git a/_field-types/supported-field-types/date-nanos.md b/_field-types/supported-field-types/date-nanos.md index 12399a69d4..eb569265fc 100644 --- a/_field-types/supported-field-types/date-nanos.md +++ b/_field-types/supported-field-types/date-nanos.md @@ -8,6 +8,8 @@ grand_parent: Supported field types --- # Date nanoseconds field type +**Introduced 1.0** +{: .label .label-purple } The `date_nanos` field type is similar to the [`date`]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/date/) field type in that it holds a date. However, `date` stores the date in millisecond resolution, while `date_nanos` stores the date in nanosecond resolution. Dates are stored as `long` values that correspond to nanoseconds since the epoch. Therefore, the range of supported dates is approximately 1970--2262. diff --git a/_field-types/supported-field-types/date.md b/_field-types/supported-field-types/date.md index fb008d1512..8ca986219b 100644 --- a/_field-types/supported-field-types/date.md +++ b/_field-types/supported-field-types/date.md @@ -11,6 +11,8 @@ redirect_from: --- # Date field type +**Introduced 1.0** +{: .label .label-purple } A date in OpenSearch can be represented as one of the following: diff --git a/_field-types/supported-field-types/flat-object.md b/_field-types/supported-field-types/flat-object.md index 933c385ac5..c9e59710e1 100644 --- a/_field-types/supported-field-types/flat-object.md +++ b/_field-types/supported-field-types/flat-object.md @@ -10,6 +10,8 @@ redirect_from: --- # Flat object field type +**Introduced 2.7** +{: .label .label-purple } In OpenSearch, you don't have to specify a mapping before indexing documents. If you don't specify a mapping, OpenSearch uses [dynamic mapping]({{site.url}}{{site.baseurl}}/field-types/index#dynamic-mapping) to map every field and its subfields in the document automatically. When you ingest documents such as logs, you may not know every field's subfield name and type in advance. In this case, dynamically mapping all new subfields can quickly lead to a "mapping explosion," where the growing number of fields may degrade the performance of your cluster. diff --git a/_field-types/supported-field-types/geo-point.md b/_field-types/supported-field-types/geo-point.md index 0912dc618d..96586d044f 100644 --- a/_field-types/supported-field-types/geo-point.md +++ b/_field-types/supported-field-types/geo-point.md @@ -11,6 +11,8 @@ redirect_from: --- # Geopoint field type +**Introduced 1.0** +{: .label .label-purple } A geopoint field type contains a geographic point specified by latitude and longitude. diff --git a/_field-types/supported-field-types/geo-shape.md b/_field-types/supported-field-types/geo-shape.md index b7b06a0d04..ee98bfca03 100644 --- a/_field-types/supported-field-types/geo-shape.md +++ b/_field-types/supported-field-types/geo-shape.md @@ -11,6 +11,8 @@ redirect_from: --- # Geoshape field type +**Introduced 1.0** +{: .label .label-purple } A geoshape field type contains a geographic shape, such as a polygon or a collection of geographic points. To index a geoshape, OpenSearch tesselates the shape into a triangular mesh and stores each triangle in a BKD tree. This provides a 10-7decimal degree of precision, which represents near-perfect spatial resolution. Performance of this process is mostly impacted by the number of vertices in a polygon you are indexing. diff --git a/_field-types/supported-field-types/ip.md b/_field-types/supported-field-types/ip.md index cb2a5569c8..99b41e45cd 100644 --- a/_field-types/supported-field-types/ip.md +++ b/_field-types/supported-field-types/ip.md @@ -10,6 +10,8 @@ redirect_from: --- # IP address field type +**Introduced 1.0** +{: .label .label-purple } An ip field type contains an IP address in IPv4 or IPv6 format. diff --git a/_field-types/supported-field-types/join.md b/_field-types/supported-field-types/join.md index c83705f4c3..c707c66774 100644 --- a/_field-types/supported-field-types/join.md +++ b/_field-types/supported-field-types/join.md @@ -11,6 +11,8 @@ redirect_from: --- # Join field type +**Introduced 1.0** +{: .label .label-purple } A join field type establishes a parent/child relationship between documents in the same index. diff --git a/_field-types/supported-field-types/keyword.md b/_field-types/supported-field-types/keyword.md index eea6cc664b..ca9c8085f6 100644 --- a/_field-types/supported-field-types/keyword.md +++ b/_field-types/supported-field-types/keyword.md @@ -11,6 +11,8 @@ redirect_from: --- # Keyword field type +**Introduced 1.0** +{: .label .label-purple } A keyword field type contains a string that is not analyzed. It allows only exact, case-sensitive matches. diff --git a/_field-types/supported-field-types/knn-vector.md b/_field-types/supported-field-types/knn-vector.md index a2a7137733..80c4085485 100644 --- a/_field-types/supported-field-types/knn-vector.md +++ b/_field-types/supported-field-types/knn-vector.md @@ -8,6 +8,8 @@ has_math: true --- # k-NN vector field type +**Introduced 1.0** +{: .label .label-purple } The [k-NN plugin]({{site.url}}{{site.baseurl}}/search-plugins/knn/index/) introduces a custom data type, the `knn_vector`, that allows users to ingest their k-NN vectors into an OpenSearch index and perform different kinds of k-NN search. The `knn_vector` field is highly configurable and can serve many different k-NN workloads. In general, a `knn_vector` field can be built either by providing a method definition or specifying a model id. diff --git a/_field-types/supported-field-types/match-only-text.md b/_field-types/supported-field-types/match-only-text.md index fd2c6b5850..534275bd3a 100644 --- a/_field-types/supported-field-types/match-only-text.md +++ b/_field-types/supported-field-types/match-only-text.md @@ -8,6 +8,8 @@ grand_parent: Supported field types --- # Match-only text field type +**Introduced 2.12** +{: .label .label-purple } A `match_only_text` field is a variant of a `text` field designed for full-text search when scoring and positional information of terms within a document are not critical. diff --git a/_field-types/supported-field-types/nested.md b/_field-types/supported-field-types/nested.md index 90d09177d1..f8dfca2ff8 100644 --- a/_field-types/supported-field-types/nested.md +++ b/_field-types/supported-field-types/nested.md @@ -11,6 +11,8 @@ redirect_from: --- # Nested field type +**Introduced 1.0** +{: .label .label-purple } A nested field type is a special type of [object field type]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/object/). diff --git a/_field-types/supported-field-types/object.md b/_field-types/supported-field-types/object.md index db539a9608..ee50f5af9d 100644 --- a/_field-types/supported-field-types/object.md +++ b/_field-types/supported-field-types/object.md @@ -11,6 +11,8 @@ redirect_from: --- # Object field type +**Introduced 1.0** +{: .label .label-purple } An object field type contains a JSON object (a set of name/value pairs). A value in a JSON object may be another JSON object. It is not necessary to specify `object` as the type when mapping object fields because `object` is the default type. diff --git a/_field-types/supported-field-types/percolator.md b/_field-types/supported-field-types/percolator.md index 92325b6127..2b067cf595 100644 --- a/_field-types/supported-field-types/percolator.md +++ b/_field-types/supported-field-types/percolator.md @@ -10,6 +10,8 @@ redirect_from: --- # Percolator field type +**Introduced 1.0** +{: .label .label-purple } A percolator field type specifies to treat this field as a query. Any JSON object field can be marked as a percolator field. Normally, documents are indexed and searches are run against them. When you use a percolator field, you store a search, and later the percolate query matches documents to that search. diff --git a/_field-types/supported-field-types/range.md b/_field-types/supported-field-types/range.md index 22ae1d619e..1001bae584 100644 --- a/_field-types/supported-field-types/range.md +++ b/_field-types/supported-field-types/range.md @@ -10,6 +10,8 @@ redirect_from: --- # Range field types +**Introduced 1.0** +{: .label .label-purple } The following table lists all range field types that OpenSearch supports. diff --git a/_field-types/supported-field-types/rank.md b/_field-types/supported-field-types/rank.md index a4ec0fac4c..f57c540cf5 100644 --- a/_field-types/supported-field-types/rank.md +++ b/_field-types/supported-field-types/rank.md @@ -10,6 +10,8 @@ redirect_from: --- # Rank field types +**Introduced 1.0** +{: .label .label-purple } The following table lists all rank field types that OpenSearch supports. diff --git a/_field-types/supported-field-types/search-as-you-type.md b/_field-types/supported-field-types/search-as-you-type.md index b9141e6b8e..55774d432a 100644 --- a/_field-types/supported-field-types/search-as-you-type.md +++ b/_field-types/supported-field-types/search-as-you-type.md @@ -11,6 +11,8 @@ redirect_from: --- # Search-as-you-type field type +**Introduced 1.0** +{: .label .label-purple } A search-as-you-type field type provides search-as-you-type functionality using both prefix and infix completion. diff --git a/_field-types/supported-field-types/text.md b/_field-types/supported-field-types/text.md index 16350c0cb3..b06bec2187 100644 --- a/_field-types/supported-field-types/text.md +++ b/_field-types/supported-field-types/text.md @@ -11,6 +11,8 @@ redirect_from: --- # Text field type +**Introduced 1.0** +{: .label .label-purple } A `text` field type contains a string that is analyzed. It is used for full-text search because it allows partial matches. Searches for multiple terms can match some but not all of them. Depending on the analyzer, results can be case insensitive, stemmed, have stopwords removed, have synonyms applied, and so on. diff --git a/_field-types/supported-field-types/token-count.md b/_field-types/supported-field-types/token-count.md index 6c3445e6a7..11eeff7854 100644 --- a/_field-types/supported-field-types/token-count.md +++ b/_field-types/supported-field-types/token-count.md @@ -11,6 +11,8 @@ redirect_from: --- # Token count field type +**Introduced 1.0** +{: .label .label-purple } A token count field type stores the number of analyzed tokens in a string. diff --git a/_field-types/supported-field-types/unsigned-long.md b/_field-types/supported-field-types/unsigned-long.md index dde8d25dee..4c38cb3090 100644 --- a/_field-types/supported-field-types/unsigned-long.md +++ b/_field-types/supported-field-types/unsigned-long.md @@ -8,6 +8,8 @@ has_children: false --- # Unsigned long field type +**Introduced 2.8** +{: .label .label-purple } The `unsigned_long` field type is a numeric field type that represents an unsigned 64-bit integer with a minimum value of 0 and a maximum value of 264 − 1. In the following example, `counter` is mapped as an `unsigned_long` field: diff --git a/_field-types/supported-field-types/wildcard.md b/_field-types/supported-field-types/wildcard.md index c438f35c62..0f8c176135 100644 --- a/_field-types/supported-field-types/wildcard.md +++ b/_field-types/supported-field-types/wildcard.md @@ -8,6 +8,8 @@ grand_parent: Supported field types --- # Wildcard field type +**Introduced 2.15** +{: .label .label-purple } A `wildcard` field is a variant of a `keyword` field designed for arbitrary substring and regular expression matching. diff --git a/_field-types/supported-field-types/xy-point.md b/_field-types/supported-field-types/xy-point.md index 57b6f64758..0d066b9f09 100644 --- a/_field-types/supported-field-types/xy-point.md +++ b/_field-types/supported-field-types/xy-point.md @@ -11,6 +11,8 @@ redirect_from: --- # xy point field type +**Introduced 2.4** +{: .label .label-purple } An xy point field type contains a point in a two-dimensional Cartesian coordinate system, specified by x and y coordinates. It is based on the Lucene [XYPoint](https://lucene.apache.org/core/9_3_0/core/org/apache/lucene/geo/XYPoint.html) field type. The xy point field type is similar to the [geopoint]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/geo-point/) field type, but does not have the range limitations of geopoint. The coordinates of an xy point are single-precision floating-point values. For information about the range and precision of floating-point values, see [Numeric field types]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/numeric/). diff --git a/_field-types/supported-field-types/xy-shape.md b/_field-types/supported-field-types/xy-shape.md index f1c7191240..9dcbafceae 100644 --- a/_field-types/supported-field-types/xy-shape.md +++ b/_field-types/supported-field-types/xy-shape.md @@ -11,6 +11,8 @@ redirect_from: --- # xy shape field type +**Introduced 2.4** +{: .label .label-purple } An xy shape field type contains a shape, such as a polygon or a collection of xy points. It is based on the Lucene [XYShape](https://lucene.apache.org/core/9_3_0/core/org/apache/lucene/document/XYShape.html) field type. To index an xy shape, OpenSearch tessellates the shape into a triangular mesh and stores each triangle in a BKD tree (a set of balanced k-dimensional trees). This provides a 10-7decimal degree of precision, which represents near-perfect spatial resolution. From fd2e9fe32efee09a23665390f6089f9a4baf46b3 Mon Sep 17 00:00:00 2001 From: Andriy Redko Date: Fri, 13 Sep 2024 08:57:51 -0400 Subject: [PATCH 2/4] Document new experimental ingestion streaming APIs (#8123) * Document new experimental ingestion streaming APIs Signed-off-by: Andriy Redko * Doc review Signed-off-by: Fanit Kolchina * Small rewording Signed-off-by: Fanit Kolchina * Address review comments Signed-off-by: Andriy Redko * Address review comments Signed-off-by: Andriy Redko * Address review comments Signed-off-by: Andriy Redko * Address review comments Signed-off-by: Andriy Redko * Address review comments Signed-off-by: Andriy Redko * Address review comments Signed-off-by: Andriy Redko --------- Signed-off-by: Andriy Redko Signed-off-by: Fanit Kolchina Co-authored-by: Fanit Kolchina --- .../document-apis/bulk-streaming.md | 81 +++++++++++++++++++ _api-reference/document-apis/bulk.md | 12 +-- 2 files changed, 87 insertions(+), 6 deletions(-) create mode 100644 _api-reference/document-apis/bulk-streaming.md diff --git a/_api-reference/document-apis/bulk-streaming.md b/_api-reference/document-apis/bulk-streaming.md new file mode 100644 index 0000000000..7d05e93c8a --- /dev/null +++ b/_api-reference/document-apis/bulk-streaming.md @@ -0,0 +1,81 @@ +--- +layout: default +title: Streaming bulk +parent: Document APIs +nav_order: 25 +redirect_from: + - /opensearch/rest-api/document-apis/bulk/streaming/ +--- + +# Streaming bulk +**Introduced 2.17.0** +{: .label .label-purple } + +This is an experimental feature and is not recommended for use in a production environment. For updates on the progress of the feature or if you want to leave feedback, see the associated [GitHub issue](https://github.com/opensearch-project/OpenSearch/issues/9065). +{: .warning} + +The streaming bulk operation lets you add, update, or delete multiple documents by streaming the request and getting the results as a streaming response. In comparison to the traditional [Bulk API]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/), streaming ingestion eliminates the need to estimate the batch size (which is affected by the cluster operational state at any given time) and naturally applies backpressure between many clients and the cluster. The streaming works over HTTP/2 or HTTP/1.1 (using chunked transfer encoding), depending on the capabilities of the clients and the cluster. + +The default HTTP transport method does not support streaming. You must install the [`transport-reactor-netty4`]({{site.url}}{{site.baseurl}}/install-and-configure/configuring-opensearch/network-settings/#selecting-the-transport) HTTP transport plugin and use it as the default HTTP transport layer. Both the `transport-reactor-netty4` plugin and the Streaming Bulk API are experimental. +{: .note} + +## Path and HTTP methods + +```json +POST _bulk/stream +POST /_bulk/stream +``` + +If you specify the index in the path, then you don't need to include it in the [request body chunks]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/#request-body). + +OpenSearch also accepts PUT requests to the `_bulk/stream` path, but we highly recommend using POST. The accepted usage of PUT---adding or replacing a single resource on a given path---doesn't make sense for streaming bulk requests. +{: .note } + + +## Query parameters + +The following table lists the available query parameters. All query parameters are optional. + +Parameter | Data type | Description +:--- | :--- | :--- +`pipeline` | String | The pipeline ID for preprocessing documents. +`refresh` | Enum | Whether to refresh the affected shards after performing the indexing operations. Default is `false`. `true` causes the changes show up in search results immediately but degrades cluster performance. `wait_for` waits for a refresh. Requests take longer to return, but cluster performance isn't degraded. +`require_alias` | Boolean | Set to `true` to require that all actions target an index alias rather than an index. Default is `false`. +`routing` | String | Routes the request to the specified shard. +`timeout` | Time | How long to wait for the request to return. Default is `1m`. +`type` | String | (Deprecated) The default document type for documents that don't specify a type. Default is `_doc`. We highly recommend ignoring this parameter and using the `_doc` type for all indexes. +`wait_for_active_shards` | String | Specifies the number of active shards that must be available before OpenSearch processes the bulk request. Default is `1` (only the primary shard). Set to `all` or a positive integer. Values greater than 1 require replicas. For example, if you specify a value of 3, the index must have 2 replicas distributed across 2 additional nodes in order for the request to succeed. +`batch_interval` | Time | Specifies for how long bulk operations should be accumulated into a batch before sending the batch to data nodes. +`batch_size` | Time | Specifies how many bulk operations should be accumulated into a batch before sending the batch to data nodes. Default is `1`. +{% comment %}_source | List | asdf +`_source_excludes` | List | asdf +`_source_includes` | List | asdf{% endcomment %} + +## Request body + +The Streaming Bulk API request body is fully compatible with the [Bulk API request body]({{site.url}}{{site.baseurl}}/api-reference/document-apis/bulk/#request-body), where each bulk operation (create/index/update/delete) is sent as a separate chunk. + +## Example request + +```json +curl -X POST "http://localhost:9200/_bulk/stream" -H "Transfer-Encoding: chunked" -H "Content-Type: application/json" -d' +{ "delete": { "_index": "movies", "_id": "tt2229499" } } +{ "index": { "_index": "movies", "_id": "tt1979320" } } +{ "title": "Rush", "year": 2013 } +{ "create": { "_index": "movies", "_id": "tt1392214" } } +{ "title": "Prisoners", "year": 2013 } +{ "update": { "_index": "movies", "_id": "tt0816711" } } +{ "doc" : { "title": "World War Z" } } +' +``` +{% include copy.html %} + +## Example response + +Depending on the batch settings, each streamed response chunk may report the results of one or many (batch) bulk operations. For example, for the preceding request with no batching (default), the streaming response may appear as follows: + +```json +{"took": 11, "errors": false, "items": [ { "index": {"_index": "movies", "_id": "tt1979320", "_version": 1, "result": "created", "_shards": { "total": 2 "successful": 1, "failed": 0 }, "_seq_no": 1, "_primary_term": 1, "status": 201 } } ] } +{"took": 2, "errors": true, "items": [ { "create": { "_index": "movies", "_id": "tt1392214", "status": 409, "error": { "type": "version_conflict_engine_exception", "reason": "[tt1392214]: version conflict, document already exists (current version [1])", "index": "movies", "shard": "0", "index_uuid": "yhizhusbSWmP0G7OJnmcLg" } } } ] } +{"took": 4, "errors": true, "items": [ { "update": { "_index": "movies", "_id": "tt0816711", "status": 404, "error": { "type": "document_missing_exception", "reason": "[_doc][tt0816711]: document missing", "index": "movies", "shard": "0", "index_uuid": "yhizhusbSWmP0G7OJnmcLg" } } } ] } +``` diff --git a/_api-reference/document-apis/bulk.md b/_api-reference/document-apis/bulk.md index 0475aa573d..4add60ee37 100644 --- a/_api-reference/document-apis/bulk.md +++ b/_api-reference/document-apis/bulk.md @@ -53,16 +53,16 @@ All bulk URL parameters are optional. Parameter | Type | Description :--- | :--- | :--- pipeline | String | The pipeline ID for preprocessing documents. -refresh | Enum | Whether to refresh the affected shards after performing the indexing operations. Default is `false`. `true` makes the changes show up in search results immediately, but hurts cluster performance. `wait_for` waits for a refresh. Requests take longer to return, but cluster performance doesn't suffer. +refresh | Enum | Whether to refresh the affected shards after performing the indexing operations. Default is `false`. `true` causes the changes show up in search results immediately but degrades cluster performance. `wait_for` waits for a refresh. Requests take longer to return, but cluster performance isn't degraded. require_alias | Boolean | Set to `true` to require that all actions target an index alias rather than an index. Default is `false`. routing | String | Routes the request to the specified shard. -timeout | Time | How long to wait for the request to return. Default `1m`. -type | String | (Deprecated) The default document type for documents that don't specify a type. Default is `_doc`. We highly recommend ignoring this parameter and using a type of `_doc` for all indexes. -wait_for_active_shards | String | Specifies the number of active shards that must be available before OpenSearch processes the bulk request. Default is 1 (only the primary shard). Set to `all` or a positive integer. Values greater than 1 require replicas. For example, if you specify a value of 3, the index must have two replicas distributed across two additional nodes for the request to succeed. +timeout | Time | How long to wait for the request to return. Default is `1m`. +type | String | (Deprecated) The default document type for documents that don't specify a type. Default is `_doc`. We highly recommend ignoring this parameter and using the `_doc` type for all indexes. +wait_for_active_shards | String | Specifies the number of active shards that must be available before OpenSearch processes the bulk request. Default is `1` (only the primary shard). Set to `all` or a positive integer. Values greater than 1 require replicas. For example, if you specify a value of 3, the index must have 2 replicas distributed across 2 additional nodes in order for the request to succeed. batch_size | Integer | **(Deprecated)** Specifies the number of documents to be batched and sent to an ingest pipeline to be processed together. Default is `2147483647` (documents are ingested by an ingest pipeline all at once). If the bulk request doesn't explicitly specify an ingest pipeline or the index doesn't have a default ingest pipeline, then this parameter is ignored. Only documents with `create`, `index`, or `update` actions can be grouped into batches. {% comment %}_source | List | asdf -_source_excludes | list | asdf -_source_includes | list | asdf{% endcomment %} +_source_excludes | List | asdf +_source_includes | List | asdf{% endcomment %} ## Request body From 4c1e7825e8c31378aef8dcefb9f33869db591f3e Mon Sep 17 00:00:00 2001 From: bowenlan-amzn Date: Fri, 13 Sep 2024 05:58:20 -0700 Subject: [PATCH 3/4] Terms query can accept encoded terms input as bitmap (#8133) * draft Signed-off-by: bowenlan-amzn * Doc review Signed-off-by: Fanit Kolchina * Update _query-dsl/term/terms.md Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: Nathan Bower Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> --------- Signed-off-by: bowenlan-amzn Signed-off-by: Fanit Kolchina Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Co-authored-by: Fanit Kolchina Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Co-authored-by: Nathan Bower --- _query-dsl/term/terms.md | 134 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) diff --git a/_query-dsl/term/terms.md b/_query-dsl/term/terms.md index 42c74c0436..7dac6a9619 100644 --- a/_query-dsl/term/terms.md +++ b/_query-dsl/term/terms.md @@ -39,6 +39,7 @@ Parameter | Data type | Description :--- | :--- | :--- `` | String | The field in which to search. A document is returned in the results only if its field value exactly matches at least one term, with the correct spacing and capitalization. `boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0. +`value_type` | String | Specifies the types of values used for filtering. Valid values are `default` and `bitmap`. If omitted, the value defaults to `default`. ## Terms lookup @@ -250,3 +251,136 @@ Parameter | Data type | Description `path` | String | The name of the field from which to fetch field values. Specify nested fields using dot path notation. Required. `routing` | String | Custom routing value of the document from which to fetch field values. Optional. Required if a custom routing value was provided when the document was indexed. `boost` | Floating-point | A floating-point value that specifies the weight of this field toward the relevance score. Values above 1.0 increase the field’s relevance. Values between 0.0 and 1.0 decrease the field’s relevance. Default is 1.0. + +## Bitmap filtering +**Introduced 2.17** +{: .label .label-purple } + +The `terms` query can filter for multiple terms simultaneously. However, when the number of terms in the input filter increases to a large value (around 10,000), the resulting network and memory overhead can become significant, making the query inefficient. In such cases, consider encoding your large terms filter using a [roaring bitmap](https://github.com/RoaringBitmap/RoaringBitmap) for more efficient filtering. + +The following example assumes that you have two indexes: a `products` index, which contains all the products sold by a company, and a `customers` index, which stores filters representing customers who own specific products. + +First, create a `products` index and map `product_id` as a `keyword`: + +```json +PUT /products +{ + "mappings": { + "properties": { + "product_id": { "type": "keyword" } + } + } +} +``` +{% include copy-curl.html %} + +Next, index three documents that correspond to products: + +```json +PUT students/_doc/1 +{ + "name": "Product 1", + "product_id" : "111" +} +``` +{% include copy-curl.html %} + +```json +PUT students/_doc/2 +{ + "name": "Product 2", + "product_id" : "222" +} +``` +{% include copy-curl.html %} + +```json +PUT students/_doc/3 +{ + "name": "Product 3", + "product_id" : "333" +} +``` +{% include copy-curl.html %} + +To store customer bitmap filters, you'll create a `customer_filter` [binary field](https://opensearch.org/docs/latest/field-types/supported-field-types/binary/) in the `customers` index. Specify `store` as `true` to store the field: + +```json +PUT /customers +{ + "mappings": { + "properties": { + "customer_filter": { + "type": "binary", + "store": true + } + } + } +} +``` +{% include copy-curl.html %} + +For each customer, you need to generate a bitmap that represents the product IDs of the products the customer owns. This bitmap effectively encodes the filter criteria for that customer. In this example, you'll create a `terms` filter for a customer whose ID is `customer123` and who owns products `111`, `222`, and `333`. + +To encode a `terms` filter for the customer, first create a roaring bitmap for the filter. This example creates a bitmap using the [PyRoaringBitMap] library, so first run `pip install pyroaring` to install the library. Then serialize the bitmap and encode it using a [Base64](https://en.wikipedia.org/wiki/Base64) encoding scheme: + +```py +from pyroaring import BitMap +import base64 + +# Create a bitmap, serialize it into a byte string, and encode into Base64 +bm = BitMap([111, 222, 333]) # product ids owned by a customer +encoded = base64.b64encode(BitMap.serialize(bm)) + +# Convert the Base64-encoded bytes to a string for storage or transmission +encoded_bm_str = encoded.decode('utf-8') + +# Print the encoded bitmap +print(f"Encoded Bitmap: {encoded_bm_str}") +``` +{% include copy.html %} + +Next, index the customer filter into the `customers` index. The document ID for the filter is the same as the ID for the corresponding customer (in this example, `customer123`). The `customer_filter` field contains the bitmap you generated for this customer: + +```json +POST customers/_doc/customer123 +{ + "customer_filter": "OjAAAAEAAAAAAAIAEAAAAG8A3gBNAQ==" +} +``` +{% include copy-curl.html %} + +Now you can run a `terms` query on the `products` index to look up a specific customer in the `customers` index. Because you're looking up a stored field instead of `_source`, set `store` to `true`. In the `value_type` field, specify the data type of the `terms` input as `bitmap`: + +```json +POST /products/_search +{ + "query": { + "terms": { + "product_id": { + "index": "customers", + "id": "customer123", + "path": "customer_filter", + "store": true + }, + "value_type": "bitmap" + } + } +} +``` +{% include copy-curl.html %} + +You can also directly pass the bitmap to the `terms` query. In this example, the `product_id` field contains the customer filter bitmap for the customer whose ID is `customer123`: + +```json +POST /products/_search +{ + "query": { + "terms": { + "product_id": "OjAAAAEAAAAAAAIAEAAAAG8A3gBNAQ==", + "value_type": "bitmap" + } + } +} +``` +{% include copy-curl.html %} \ No newline at end of file From 27c02f95de42598c518bb446fce043869daab51a Mon Sep 17 00:00:00 2001 From: Marc Handalian Date: Fri, 13 Sep 2024 06:04:12 -0700 Subject: [PATCH 4/4] Derived field updates for 2.17 (#8244) * update aggregation support in limitations and add sample req/response. Signed-off-by: Marc Handalian * remove concurrent search limitation Signed-off-by: Marc Handalian * minor updates Signed-off-by: Marc Handalian * fix style job error Signed-off-by: Marc Handalian * Doc review Signed-off-by: Fanit Kolchina * Clarify the list of unsupported geo aggregations Signed-off-by: Fanit Kolchina * Apply suggestions from code review Co-authored-by: Nathan Bower Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> --------- Signed-off-by: Marc Handalian Signed-off-by: Fanit Kolchina Signed-off-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Co-authored-by: Fanit Kolchina Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Co-authored-by: Nathan Bower --- _field-types/supported-field-types/derived.md | 78 ++++++++++++++++++- 1 file changed, 76 insertions(+), 2 deletions(-) diff --git a/_field-types/supported-field-types/derived.md b/_field-types/supported-field-types/derived.md index d989c3e4a4..cb358f47f9 100644 --- a/_field-types/supported-field-types/derived.md +++ b/_field-types/supported-field-types/derived.md @@ -28,11 +28,11 @@ Despite the potential performance impact of query-time computations, the flexibi Currently, derived fields have the following limitations: -- **Aggregation, scoring, and sorting**: Not yet supported. +- **Scoring and sorting**: Not yet supported. +- **Aggregations**: Starting with OpenSearch 2.17, derived fields support most aggregation types. The following aggregations are not supported: geographic (geodistance, geohash grid, geohex grid, geotile grid, geobounds, geocentroid), significant terms, significant text, and scripted metric. - **Dashboard support**: These fields are not displayed in the list of available fields in OpenSearch Dashboards. However, you can still use them for filtering if you know the derived field name. - **Chained derived fields**: One derived field cannot be used to define another derived field. - **Join field type**: Derived fields are not supported for the [join field type]({{site.url}}{{site.baseurl}}/opensearch/supported-field-types/join/). -- **Concurrent segment search**: Derived fields are not supported for [concurrent segment search]({{site.url}}{{site.baseurl}}/search-plugins/concurrent-segment-search/). We are planning to address these limitations in future versions. @@ -541,6 +541,80 @@ The response specifies highlighting in the `url` field: ``` +## Aggregations + +Starting with OpenSearch 2.17, derived fields support most aggregation types. + +Geographic, significant terms, significant text, and scripted metric aggregations are not supported. +{: .note} + +For example, the following request creates a simple `terms` aggregation on the `method` derived field: + +```json +POST /logs/_search +{ + "size": 0, + "aggs": { + "methods": { + "terms": { + "field": "method" + } + } + } +} +``` +{% include copy-curl.html %} + +The response contains the following buckets: + +
+ + Response + + {: .text-delta} + +```json +{ + "took" : 12, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 5, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ ] + }, + "aggregations" : { + "methods" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ + { + "key" : "GET", + "doc_count" : 2 + }, + { + "key" : "POST", + "doc_count" : 2 + }, + { + "key" : "DELETE", + "doc_count" : 1 + } + ] + } + } +} +``` +
+ ## Performance Derived fields are not indexed but are computed dynamically by retrieving values from the `_source` field or doc values. Thus, they run more slowly. To improve performance, try the following: