Don't create Elasticsearch span names containing document IDs (#705)

* Fix typo: _DEFALT_OP_NAME * Extract ES document ID from URL, put in attributes Elasticsearch creates URLs for index() and delete() before they hit perform_request(). This means there would be many unique span names containing unique document IDs, of the form 'Elasticsearch/indexname/_doc/documentid'. This extracts the document ID from the URL and replaces it with ':id', then puts it in the span's attributes. * Add TODO comment with link to issue * Add CHANGELOG entry * Don't use custom doc types, deprecated in ES 7 * Update tests to match instrumentation
open-telemetry · Oct 12, 2021 · 1b75672 · 1b75672
1 parent 4046fdb
commit 1b75672
Show file tree

Hide file tree

Showing 3 changed files with 43 additions and 20 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -30,6 +30,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   ([679](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/679))
 - `opentelemetry-exporter-richconsole` Initial release
   ([#686](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/686))
+- `opentelemetry-instrumentation-elasticsearch` no longer creates unique span names by including document IDs, replaces them with `:id` and puts the value in attribute `elasticsearch.id`
+  ([#705](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/705))
 - `opentelemetry-instrumentation-tornado` now sets `http.client_ip` and `tornado.handler` attributes
   ([#706](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/706))
 - `opentelemetry-instrumentation-requests` added exclude urls functionality

diff --git a/...instrumentation-elasticsearch/src/opentelemetry/instrumentation/elasticsearch/__init__.py b/...instrumentation-elasticsearch/src/opentelemetry/instrumentation/elasticsearch/__init__.py
@@ -81,6 +81,7 @@ def response_hook(span, response):
     es.get(index='my-index', doc_type='my-type', id=1)
 """
 
+import re
 from logging import getLogger
 from os import environ
 from typing import Collection
@@ -107,7 +108,7 @@ def response_hook(span, response):
     "took",
 ]
 
-_DEFALT_OP_NAME = "request"
+_DEFAULT_OP_NAME = "request"
 
 
 class ElasticsearchInstrumentor(BaseInstrumentor):
@@ -146,6 +147,9 @@ def _uninstrument(self, **kwargs):
         unwrap(elasticsearch.Transport, "perform_request")
 
 
+_regex_doc_url = re.compile(r"/_doc/([^/]+)")
+
+
 def _wrap_perform_request(
     tracer, span_name_prefix, request_hook=None, response_hook=None
 ):
@@ -161,7 +165,24 @@ def wrapper(wrapped, _, args, kwargs):
                 len(args),
             )
 
-        op_name = span_name_prefix + (url or method or _DEFALT_OP_NAME)
+        op_name = span_name_prefix + (url or method or _DEFAULT_OP_NAME)
+        doc_id = None
+        if url:
+            # TODO: This regex-based solution avoids creating an unbounded number of span names, but should be replaced by instrumenting individual Elasticsearch methods instead of Transport.perform_request()
+            # A limitation of the regex is that only the '_doc' mapping type is supported. Mapping types are deprecated since Elasticsearch 7
+            # https://github.com/open-telemetry/opentelemetry-python-contrib/issues/708
+            match = _regex_doc_url.search(url)
+            if match is not None:
+                # Remove the full document ID from the URL
+                doc_span = match.span()
+                op_name = (
+                    span_name_prefix
+                    + url[: doc_span[0]]
+                    + "/_doc/:id"
+                    + url[doc_span[1] :]
+                )
+                # Put the document ID in attributes
+                doc_id = match.group(1)
         params = kwargs.get("params", {})
         body = kwargs.get("body", None)
 
@@ -184,6 +205,8 @@ def wrapper(wrapped, _, args, kwargs):
                     attributes[SpanAttributes.DB_STATEMENT] = str(body)
                 if params:
                     attributes["elasticsearch.params"] = str(params)
+                if doc_id:
+                    attributes["elasticsearch.id"] = doc_id
                 for key, value in attributes.items():
                     span.set_attribute(key, value)
 

diff --git a/instrumentation/opentelemetry-instrumentation-elasticsearch/tests/test_elasticsearch.py b/instrumentation/opentelemetry-instrumentation-elasticsearch/tests/test_elasticsearch.py
@@ -64,7 +64,7 @@ def test_instrumentor(self, request_mock):
         request_mock.return_value = (1, {}, {})
 
         es = Elasticsearch()
-        es.index(index="sw", doc_type="people", id=1, body={"name": "adam"})
+        es.index(index="sw", doc_type="_doc", id=1, body={"name": "adam"})
 
         spans_list = self.get_finished_spans()
         self.assertEqual(len(spans_list), 1)
@@ -79,7 +79,7 @@ def test_instrumentor(self, request_mock):
         # check that no spans are generated after uninstrument
         ElasticsearchInstrumentor().uninstrument()
 
-        es.index(index="sw", doc_type="people", id=1, body={"name": "adam"})
+        es.index(index="sw", doc_type="_doc", id=1, body={"name": "adam"})
 
         spans_list = self.get_finished_spans()
         self.assertEqual(len(spans_list), 1)
@@ -119,7 +119,7 @@ def test_prefix_env(self, request_mock):
 
     def _test_prefix(self, prefix):
         es = Elasticsearch()
-        es.index(index="sw", doc_type="people", id=1, body={"name": "adam"})
+        es.index(index="sw", doc_type="_doc", id=1, body={"name": "adam"})
 
         spans_list = self.get_finished_spans()
         self.assertEqual(len(spans_list), 1)
@@ -133,11 +133,12 @@ def test_result_values(self, request_mock):
             '{"found": false, "timed_out": true, "took": 7}',
         )
         es = Elasticsearch()
-        es.get(index="test-index", doc_type="tweet", id=1)
+        es.get(index="test-index", doc_type="_doc", id=1)
 
         spans = self.get_finished_spans()
 
         self.assertEqual(1, len(spans))
+        self.assertEqual(spans[0].name, "Elasticsearch/test-index/_doc/:id")
         self.assertEqual("False", spans[0].attributes["elasticsearch.found"])
         self.assertEqual(
             "True", spans[0].attributes["elasticsearch.timed_out"]
@@ -159,7 +160,7 @@ def test_trace_error_not_found(self, request_mock):
     def _test_trace_error(self, code, exc):
         es = Elasticsearch()
         try:
-            es.get(index="test-index", doc_type="tweet", id=1)
+            es.get(index="test-index", doc_type="_doc", id=1)
         except Exception:  # pylint: disable=broad-except
             pass
 
@@ -176,15 +177,13 @@ def test_parent(self, request_mock):
         request_mock.return_value = (1, {}, {})
         es = Elasticsearch()
         with self.tracer.start_as_current_span("parent"):
-            es.index(
-                index="sw", doc_type="people", id=1, body={"name": "adam"}
-            )
+            es.index(index="sw", doc_type="_doc", id=1, body={"name": "adam"})
 
         spans = self.get_finished_spans()
         self.assertEqual(len(spans), 2)
 
         parent = spans.by_name("parent")
-        child = spans.by_name("Elasticsearch/sw/people/1")
+        child = spans.by_name("Elasticsearch/sw/_doc/:id")
         self.assertIsNotNone(child.parent)
         self.assertEqual(child.parent.span_id, parent.context.span_id)
 
@@ -198,13 +197,13 @@ def test_multithread(self, request_mock):
         # 3. Check the spans got different parents, and are in the expected order.
         def target1(parent_span):
             with trace.use_span(parent_span):
-                es.get(index="test-index", doc_type="tweet", id=1)
+                es.get(index="test-index", doc_type="_doc", id=1)
                 ev.set()
                 ev.wait()
 
         def target2():
             ev.wait()
-            es.get(index="test-index", doc_type="tweet", id=2)
+            es.get(index="test-index", doc_type="_doc", id=2)
             ev.set()
 
         with self.tracer.start_as_current_span("parent") as span:
@@ -220,8 +219,8 @@ def target2():
         self.assertEqual(3, len(spans))
 
         s1 = spans.by_name("parent")
-        s2 = spans.by_name("Elasticsearch/test-index/tweet/1")
-        s3 = spans.by_name("Elasticsearch/test-index/tweet/2")
+        s2 = spans.by_attr("elasticsearch.id", "1")
+        s3 = spans.by_attr("elasticsearch.id", "2")
 
         self.assertIsNotNone(s2.parent)
         self.assertEqual(s2.parent.span_id, s1.context.span_id)
@@ -343,10 +342,9 @@ def request_hook(span, method, url, kwargs):
         )
         es = Elasticsearch()
         index = "test-index"
-        doc_type = "tweet"
         doc_id = 1
         kwargs = {"params": {"test": True}}
-        es.get(index=index, doc_type=doc_type, id=doc_id, **kwargs)
+        es.get(index=index, doc_type="_doc", id=doc_id, **kwargs)
 
         spans = self.get_finished_spans()
 
@@ -355,7 +353,7 @@ def request_hook(span, method, url, kwargs):
             "GET", spans[0].attributes[request_hook_method_attribute]
         )
         self.assertEqual(
-            f"/{index}/{doc_type}/{doc_id}",
+            f"/{index}/_doc/{doc_id}",
             spans[0].attributes[request_hook_url_attribute],
         )
         self.assertEqual(
@@ -390,7 +388,7 @@ def response_hook(span, response):
                 "hits": [
                     {
                         "_index": "test-index",
-                        "_type": "tweet",
+                        "_type": "_doc",
                         "_id": "1",
                         "_score": 0.18232156,
                         "_source": {"name": "tester"},
@@ -405,7 +403,7 @@ def response_hook(span, response):
             json.dumps(response_payload),
         )
         es = Elasticsearch()
-        es.get(index="test-index", doc_type="tweet", id=1)
+        es.get(index="test-index", doc_type="_doc", id=1)
 
         spans = self.get_finished_spans()