Commit 0a89bb3

Add safer path operators and improve documentation

The documentation no longer presents the unsafe `raw` operators first, and better explains which operators exist and what to be careful about. This also adds a new `through` operator, which allows for streaming handling of path matches.

1 parent 2de68df commit 0a89bb3
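In practice, the new `through` operator removes the need to manage the inner match streams manually: each match is fed to a user-supplied draining `Pipe`, and all matches are joined in parallel internally. A minimal sketch of the intended call site (assuming fs2-data ≥ 1.12.0; the `countTokens` pipe and the literals import path are illustrative, not part of this commit):

```scala
import cats.effect._
import cats.effect.unsafe.implicits.global
import fs2._
import fs2.data.json._
import fs2.data.json.jsonpath.filter
import fs2.data.json.jsonpath.literals._

// Illustrative sink: consumes one match and prints its token count.
// Any Pipe[IO, Token, Nothing] (writing to a file, a queue, ...) works here.
def countTokens(ts: Stream[IO, Token]): Stream[IO, Nothing] =
  Stream.exec(ts.compile.count.flatMap(n => IO.println(s"match of $n tokens")))

Stream.emits("""{"a": {"a": 1}}""".toList)
  .through(tokens[IO, Char])
  .through(filter.through(jsonpath"$$..a", countTokens(_)))
  .compile
  .drain
  .unsafeRunSync()
```

Unlike `unsafeRaw`, there is no inner stream left for the caller to forget to consume; the operator is safe by construction.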

File tree

5 files changed: +178 −35 lines

finite-state/shared/src/main/scala/fs2/data/pfsa/TreeQueryPipe.scala (+3)

```diff
@@ -138,6 +138,9 @@ private[data] abstract class TreeQueryPipe[F[_]: Concurrent, T, O <: T, Matcher,
   final def topmost(s: Stream[F, T]): Stream[F, T] =
     raw(Int.MaxValue, 0)(s).parJoinUnbounded
 
+  final def through(s: Stream[F, T], pipe: Pipe[F, T, Nothing], maxMatch: Int, maxNest: Int): Stream[F, Nothing] =
+    raw(maxMatch = maxMatch, maxNest = maxNest)(s).map(_.through(pipe)).parJoinUnbounded
+
   final def aggregate[U](s: Stream[F, T],
                          f: Stream[F, T] => F[U],
                          deterministic: Boolean,
```
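The new `TreeQueryPipe.through` builds on `raw`: every inner match stream is routed through a draining pipe, and the resulting streams are run concurrently with `parJoinUnbounded`. The same pattern can be sketched with plain fs2, independent of fs2-data (all names here, such as `sink`, are illustrative):

```scala
import cats.effect.{IO, IOApp}
import fs2.{Pipe, Stream}

object ParJoinPattern extends IOApp.Simple {
  // A draining pipe: consumes its input and emits nothing (output type Nothing).
  def sink(id: Int): Pipe[IO, Int, Nothing] =
    _.foldMonoid.evalMap(sum => IO.println(s"stream $id summed to $sum")).drain

  // A stream of inner streams, standing in for the matches produced by `raw`.
  val inner: Stream[IO, Stream[IO, Int]] =
    Stream.range(0, 3).map(i => Stream.range(0, 10).map(_ * i).covary[IO])

  // The pattern used by `through`: feed each inner stream to its pipe,
  // then join them all in parallel so none is left unconsumed.
  val run: IO[Unit] =
    inner.zipWithIndex
      .map { case (s, idx) => s.through(sink(idx.toInt)) }
      .parJoinUnbounded
      .compile
      .drain
}
```

Because every inner stream is consumed by its pipe before joining, this construction avoids the hanging-program hazard that `raw` leaves to the caller.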

json/src/main/scala/fs2/data/json/jsonpath/package.scala (+27 −2)

```diff
@@ -49,14 +49,20 @@ package object jsonpath {
       * E.g., if you want to emit only the top most matches, set it to `0`.
       *
       * '''Warning''': make sure you actually consume all the emitted streams otherwise
-      * this can lead to memory problems.
+      * this can lead to memory problems. The streams must all be consumed in parallel
+      * to avoid hanging programs.
       */
-    def raw(path: JsonPath, maxMatch: Int = Int.MaxValue, maxNest: Int = Int.MaxValue)(implicit
+    def unsafeRaw(path: JsonPath, maxMatch: Int = Int.MaxValue, maxNest: Int = Int.MaxValue)(implicit
         F: Concurrent[F]): Pipe[F, Token, Stream[F, Token]] =
       _.through(JsonTagger.pipe)
         .through(new JsonQueryPipe(compileJsonPath(path)).raw(maxMatch, maxNest)(_))
         .map(_.map(untag(_)).unNone)
 
+    @deprecated(message = "Use `filter.unsafeRaw()` instead", since = "fs2-data 1.12.0")
+    def raw(path: JsonPath, maxMatch: Int = Int.MaxValue, maxNest: Int = Int.MaxValue)(implicit
+        F: Concurrent[F]): Pipe[F, Token, Stream[F, Token]] =
+      unsafeRaw(path = path, maxMatch = maxMatch, maxNest = maxNest)
+
     /** Selects the first match in the input stream. The tokens of the first matching
       * value are emitted as they are read.
       *
@@ -96,6 +102,25 @@ package object jsonpath {
           maxNest))
         .flatMap(Stream.emits(_))
 
+    /** Selects all matching elements in the input stream, feeding them to the provided [[fs2.Pipe]] in parallel.
+      * Each match results in a new stream of [[fs2.data.json.Token Token]] fed to the `pipe`.
+      * All the matches are processed in parallel as soon as new tokens are available.
+      *
+      * The `maxMatch` parameter controls how many matches are to be emitted at most.
+      * Any further matches won't be emitted.
+      *
+      * The `maxNest` parameter controls the maximum level of match nesting to be emitted.
+      * E.g., if you want to emit only the top most matches, set it to `0`.
+      */
+    def through(path: JsonPath,
+                pipe: Pipe[F, Token, Nothing],
+                maxMatch: Int = Int.MaxValue,
+                maxNest: Int = Int.MaxValue)(implicit F: Concurrent[F]): Pipe[F, Token, Nothing] =
+      _.through(JsonTagger.pipe)
+        .through(
+          new JsonQueryPipe(compileJsonPath(path))
+            .through(_, _.map(untag(_)).unNone.through(pipe), maxMatch, maxNest))
+
     /** Selects all matching elements in the input stream, and applies the [[fs2.Collector]] to it.
       *
       * If `deterministic` is set to `true` (default value), elements are emitted in the order they
```

site/documentation/json/jsonpath.md (+63 −19)

````diff
@@ -72,25 +72,28 @@ The supported JSONPath features are:
 Using the path defined above, we can filter the stream of tokens, to only emit selected tokens downstream. This can be used to drastically reduce the amount of emitted data, to only the parts that are of interest for you.
 The filtering pipes are located in the `fs2.data.json.jsonpath.filter` namespace.
 
-Since JSONPath includes a recursive descent operator, there can be nested matches for your path.
-The `filter.raw` emits a stream of all matches.
-Each match is represented as a nested stream of JSON tokens which must be consumed.
+The main operators in the namespace are:
 
-```scala mdoc
-import fs2.data.json.jsonpath.filter
+ - `filter.first(path)` which is a `Pipe` returning the tokens of the first match only.
+ - `filter.collect(path, collector)` which uses the provided `collector` to aggregate the tokens of each match, and emits all the aggregated results.
+ - `filter.values[Json](path)` which builds the AST for each match for any type `Json` with a [`Builder`][json-builder] in scope.
+ - `filter.through(path, pipe)` which sends all matches as a stream through the provided `pipe`.
 
-import cats.effect._
-import cats.effect.unsafe.implicits.global
+@:callout(info)
+Since JSONPath includes a recursive descent operator, there can be nested matches for your path.
+The matches are returned in the order their first matching token is encountered in the input.
+This means that for nested matches, the first stream returned is the ancestor element.
+@:@
 
-val filtered = stream.lift[IO].through(filter.raw(path)).parEvalMapUnbounded(_.compile.toList)
-filtered.compile.toList.unsafeRunSync()
-```
-
-The matching streams are returned in the order their matching element is encountered in the input.
-This means that for nested matches, the first stream returned is the ancestor element.
+Using `filter.collect`, you can build a stream that collects each match for the provided collector and emits the aggregated result. For instance, to build the list of string representations of the matches, you can run the following code.
 
 ```scala mdoc
 import fs2.data.json.literals._
+import fs2.data.json.jsonpath.filter
+
+import cats.effect._
+import cats.effect.unsafe.implicits.global
 
 val recursive = jsonpath"$$..a"
 
@@ -106,34 +109,75 @@ val json = json"""{
 
 json
   .lift[IO]
-  .through(filter.raw(recursive))
-  .parEvalMapUnbounded(_.compile.toList)
+  .through(filter.collect(recursive, List))
   .compile
   .toList
   .unsafeRunSync()
 ```
 
-This is actually a common use case, so the library offers `filter.collect` to have this behavior for any collector.
+If you want to have results emitted as early as possible instead of in order, you can set the `deterministic` parameter to `false`.
 
 ```scala mdoc
 json
   .lift[IO]
-  .through(filter.collect(recursive, List))
+  .through(filter.collect(recursive, List, deterministic = false))
   .compile
   .toList
   .unsafeRunSync()
 ```
 
-If you want to have results emitted as early as possible instead of in order, you can set the `deterministic` parameter to `false`.
+The `filter.through` operator allows for handling each match in a streaming fashion.
+For instance, let's say you want to save each match in a file, incrementing a counter on each match. You can run the following code.
+
+```scala mdoc
+import fs2.io.file.{Files, Path}
+
+def saveJson(counter: Ref[IO, Int], tokens: Stream[IO, Token]): Stream[IO, Nothing] =
+  Stream.eval(counter.getAndUpdate(_ + 1)).flatMap { index =>
+    tokens
+      .through(render.compact)
+      .through(Files[IO].writeUtf8(Path(s"match-$index.json")))
+  }
+
+val program =
+  for {
+    counter <- Ref[IO].of(0)
+    _ <- json
+      .lift[IO]
+      .through(filter.through(recursive, saveJson(counter, _)))
+      .compile
+      .drain
+  } yield ()
+
+program.unsafeRunSync()
+
+Files[IO].readUtf8(Path("match-0.json")).compile.string.unsafeRunSync()
+Files[IO].readUtf8(Path("match-1.json")).compile.string.unsafeRunSync()
+```
+
+@:callout(warning)
+The operator described below is unsafe and should be used carefully only if none of the above operators fits your purpose.
+When using it, please ensure that you:
+
+ - consume **all** inner `Stream`s
+ - consume them in **parallel** (e.g. with a variant of `parEvalMap` and parallelism >1, or with a variant of `parJoin`).
+
+Failure to do so might result in memory leaks or hanging programs.
+@:@
+
+The `filter.unsafeRaw` operator emits a stream of all matches.
+Each match is represented as a nested stream of JSON tokens which must be consumed.
 
 ```scala mdoc
+
 json
   .lift[IO]
-  .through(filter.collect(recursive, List, deterministic = false))
+  .through(filter.unsafeRaw(recursive))
+  .parEvalMapUnbounded(_.compile.toList)
   .compile
   .toList
   .unsafeRunSync()
 ```
-
 [monad-error]: https://typelevel.org/cats/api/cats/MonadError.html
 [jsonpath]: https://goessner.net/articles/JsonPath/index.html
+[json-builder]: index.md#ast-builder-and-tokenizer
````

site/documentation/xml/xpath.md (+61 −12)

````diff
@@ -79,47 +79,96 @@ You can use parentheses to associate differently, for instance `!(p1 && p2) || p
 Using the path defined above, we can filter the stream of events, to only emit selected tokens downstream. This can be used to drastically reduce the amount of emitted data, to only the parts that are of interest for you.
 The filtering pipes are located in the `fs2.data.xml.xpath.filter` namespace.
 
-Since XPath includes a recursive descent operator, there can be nested matches for your path.
-The `filter.raw` emits a stream of all matches.
-Each match is represented as a nested stream of XML events which must be consumed.
+The main operators in the namespace are:
+
+ - `filter.first(xpath)` which is a `Pipe` returning the events of the first match only.
+ - `filter.collect(xpath, collector)` which uses the provided `collector` to aggregate the events of each match, and emits all the aggregated results.
+ - `filter.dom[Node](xpath)` which builds the DOM for each match for any DOM type `Node` with a [`DocumentBuilder`][dom-builder] in scope.
+ - `filter.through(xpath, pipe)` which sends all matches as a stream through the provided `pipe`.
+
+@:callout(info)
+Since XPath includes a recursive descent operator, there can be nested matches for your xpath.
+The matches are returned in the order their opening matching element is encountered in the input by default.
+This means that for nested matches, the first stream returned is the ancestor element.
+@:@
+
+Using `filter.collect`, you can build a stream that collects each match for the provided collector and emits the aggregated result. For instance, to build the list of string representations of the matches, you can run the following code.
 
 ```scala mdoc
 import cats.effect._
 import cats.effect.unsafe.implicits.global
 
 stream
   .lift[IO]
-  .through(filter.raw(path))
-  .parEvalMapUnbounded(_.through(render.raw()).compile.foldMonoid)
+  .through(filter.collect(path, collector.raw()))
   .compile
   .toList
   .unsafeRunSync()
 ```
 
-The matching streams are returned in the order their matching element is encountered in the input.
-This means that for nested matches, the first stream returned is the ancestor element.
-
-The library offers `filter.collect` to collect each match for any collector.
+If you want to have results emitted as early as possible instead of in order, you can set the `deterministic` parameter to `false`.
 
 ```scala mdoc
 stream
   .lift[IO]
-  .through(filter.collect(path, collector.raw()))
+  .through(filter.collect(path, collector.raw(), deterministic = false))
   .compile
   .toList
   .unsafeRunSync()
 ```
 
-If you want to have results emitted as early as possible instead of in order, you can set the `deterministic` parameter to `false`.
+The `filter.through` operator allows for handling each match in a streaming fashion.
+For instance, let's say you want to save each match in a file, incrementing a counter on each match. You can run the following code.
+
+```scala mdoc
+import fs2.io.file.{Files, Path}
+
+def saveXml(counter: Ref[IO, Int], events: Stream[IO, XmlEvent]): Stream[IO, Nothing] =
+  Stream.eval(counter.getAndUpdate(_ + 1)).flatMap { index =>
+    events
+      .through(render.raw())
+      .through(Files[IO].writeUtf8(Path(s"match-$index.xml")))
+  }
+
+val program =
+  for {
+    counter <- Ref[IO].of(0)
+    _ <- stream
+      .lift[IO]
+      .through(filter.through(path, saveXml(counter, _)))
+      .compile
+      .drain
+  } yield ()
+
+program.unsafeRunSync()
+
+Files[IO].readUtf8(Path("match-0.xml")).compile.string.unsafeRunSync()
+Files[IO].readUtf8(Path("match-1.xml")).compile.string.unsafeRunSync()
+```
+
+@:callout(warning)
+The operator described below is unsafe and should be used carefully only if none of the above operators fits your purpose.
+When using it, please ensure that you:
+
+ - consume **all** inner `Stream`s
+ - consume them in **parallel** (e.g. with a variant of `parEvalMap` and parallelism >1, or with a variant of `parJoin`).
+
+Failure to do so might result in memory leaks or hanging programs.
+@:@
+
+The `filter.unsafeRaw` operator emits a stream of all matches.
+Each match is represented as a nested stream of XML events which must be consumed.
 
 ```scala mdoc
 stream
   .lift[IO]
-  .through(filter.collect(path, collector.raw(), deterministic = false))
+  .through(filter.unsafeRaw(path))
+  .parEvalMapUnbounded(_.through(render.raw()).compile.foldMonoid)
   .compile
   .toList
   .unsafeRunSync()
 ```
 
 [monad-error]: https://typelevel.org/cats/api/cats/MonadError.html
 [xpath]: https://www.w3.org/TR/xpath/
+[dom-builder]: index.md#dom-builder-and-eventifier
````

xml/src/main/scala/fs2/data/xml/xpath/package.scala (+24 −2)

```diff
@@ -48,12 +48,18 @@ package object xpath {
       * E.g., if you want to emit only the top most matches, set it to `0`.
       *
      * '''Warning''': make sure you actually consume all the emitted streams otherwise
-      * this can lead to memory problems.
+      * this can lead to memory problems. The streams must all be consumed in parallel
+      * to avoid hanging programs.
       */
-    def raw(path: XPath, maxMatch: Int = Int.MaxValue, maxNest: Int = Int.MaxValue)(implicit
+    def unsafeRaw(path: XPath, maxMatch: Int = Int.MaxValue, maxNest: Int = Int.MaxValue)(implicit
         F: Concurrent[F]): Pipe[F, XmlEvent, Stream[F, XmlEvent]] =
       new XmlQueryPipe(compileXPath(path)).raw(maxMatch, maxNest)(_)
 
+    @deprecated(message = "Use `filter.unsafeRaw()` instead", since = "fs2-data 1.12.0")
+    def raw(path: XPath, maxMatch: Int = Int.MaxValue, maxNest: Int = Int.MaxValue)(implicit
+        F: Concurrent[F]): Pipe[F, XmlEvent, Stream[F, XmlEvent]] =
+      unsafeRaw(path = path, maxMatch = maxMatch, maxNest = maxNest)
+
     /** Selects the first match only. First is meant as in: opening tag appears first in the input, no matter the depth.
       * Tokens of the first match are emitted as they are read from the input.
      *
@@ -84,6 +90,22 @@ package object xpath {
         .aggregate(_, _.through(xml.dom.elements).compile.toList, deterministic, maxMatch, maxNest)
         .flatMap(Stream.emits(_))
 
+    /** Selects all matching elements in the input stream, feeding them to the provided [[fs2.Pipe]] in parallel.
+      * Each match results in a new stream of [[fs2.data.xml.XmlEvent XmlEvent]] fed to the `pipe`.
+      * All the matches are processed in parallel as soon as new events are available.
+      *
+      * The `maxMatch` parameter controls how many matches are to be emitted at most.
+      * Any further matches won't be emitted.
+      *
+      * The `maxNest` parameter controls the maximum level of match nesting to be emitted.
+      * E.g., if you want to emit only the top most matches, set it to `0`.
+      */
+    def through(path: XPath,
+                pipe: Pipe[F, XmlEvent, Nothing],
+                maxMatch: Int = Int.MaxValue,
+                maxNest: Int = Int.MaxValue)(implicit F: Concurrent[F]): Pipe[F, XmlEvent, Nothing] =
+      new XmlQueryPipe(compileXPath(path)).through(_, pipe, maxMatch, maxNest)
+
     /** Selects all matching elements in the input stream, and applies the [[fs2.Collector]] to it.
       *
       * If `deterministic` is set to `true` (default value), elements are emitted in the order they
```

0 commit comments