Skip to content

Commit 111456a

Browse files
committed
Avoid per-byte loop in cstring{,Utf8} builders
Copy chunks of the input to the output buffer with, up to the shorter of the available buffer space and the "null-free" portion of the remaining string. Actually "null-free" here means not containing any denormalised two-byte encodings starting with 0xC0 (so possibly also other ASCII bytes if the UTF-8 encoding is oddball). This substantially improves performance, with just one "15%" increase that looks like a spurious measurement error (perhaps code layout difference artefact). UTF-8 String (12B): OK 16.7 ns ± 1.3 ns, 60% less than baseline UTF-8 String (64B, one null): OK 22.6 ns ± 1.3 ns, 87% less than baseline UTF-8 String (64B, one null, no shared work): OK 30.1 ns ± 2.6 ns, 83% less than baseline UTF-8 String (64B, half nulls): OK 92.6 ns ± 5.3 ns, 49% less than baseline UTF-8 String (64B, all nulls): OK 76.3 ns ± 4.5 ns, 57% less than baseline UTF-8 String (64B, all nulls, no shared work): OK 82.3 ns ± 5.6 ns, 54% less than baseline ASCII String (12B): OK 6.50 ns ± 326 ps, 76% less than baseline ASCII String (64B): OK 8.03 ns ± 334 ps, 94% less than baseline AsciiLit: OK 8.02 ns ± 648 ps, 94% less than baseline Utf8Lit: OK 21.8 ns ± 1.3 ns, 88% less than baseline strLit: OK 8.90 ns ± 788 ps, 94% less than baseline stringUtf8: OK 22.4 ns ± 1.3 ns, 87% less than baseline strLitInline: OK 8.26 ns ± 676 ps, 94% less than baseline utf8LitInline: OK 23.2 ns ± 1.3 ns, 87% less than baseline foldMap byteStringInsert (10000): OK 46.0 μs ± 4.0 μs, 15% less than baseline --> lazyByteStringHex (10000): OK --> 4.74 μs ± 337 ns, 15% more than baseline foldMap integerDec (small) (10000): OK 205 μs ± 12 μs, 9% less than baseline char8 (10000): OK 2.58 μs ± 234 ns, 30% less than baseline foldMap (left-assoc) (10000): OK 73.2 μs ± 2.9 μs, 54% less than baseline foldMap (right-assoc) (10000): OK 43.0 μs ± 4.2 μs, 65% less than baseline foldMap [manually fused, left-assoc] (10000): OK 81.4 μs ± 5.3 μs, 48% less than baseline foldMap [manually fused, right-assoc] (10000): OK 47.3 μs ± 785 ns, 61% less than baseline
1 parent 188512c commit 111456a

File tree

4 files changed

+157
-55
lines changed

4 files changed

+157
-55
lines changed

Data/ByteString/Builder/Internal.hs

Lines changed: 108 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1-
{-# LANGUAGE Unsafe #-}
2-
{-# LANGUAGE TypeFamilies #-}
1+
{-# LANGUAGE MagicHash #-}
32
{-# LANGUAGE NoMonoLocalBinds #-}
3+
{-# LANGUAGE TypeFamilies #-}
4+
{-# LANGUAGE Unsafe #-}
5+
{-# LANGUAGE ViewPatterns #-}
46

57
{-# OPTIONS_HADDOCK not-home #-}
68

@@ -87,6 +89,8 @@ module Data.ByteString.Builder.Internal (
8789
-- , sizedChunksInsert
8890

8991
, byteStringCopy
92+
, asciiLiteralCopy
93+
, modUtf8LitCopy
9094
, byteStringInsert
9195
, byteStringThreshold
9296

@@ -816,6 +820,7 @@ ensureFree :: Int -> Builder
816820
ensureFree minFree =
817821
builder step
818822
where
823+
step :: forall r. BuildStep r -> BuildStep r
819824
step k br@(BufferRange op ope)
820825
| ope `minusPtr` op < minFree = return $ bufferFull minFree op k
821826
| otherwise = k br
@@ -839,6 +844,25 @@ wrappedBytesCopyStep bs0 k =
839844
where
840845
outRemaining = ope `minusPtr` op
841846

847+
-- | Copy the bytes from a 'BufferRange' into the output stream.
848+
wrappedBufferRangeCopyStep :: BufferRange -- ^ Input 'BufferRange'.
849+
-> BuildStep a -> BuildStep a
850+
wrappedBufferRangeCopyStep (BufferRange ip0 ipe) k =
851+
go ip0
852+
where
853+
go !ip (BufferRange op ope)
854+
| inpRemaining <= outRemaining = do
855+
copyBytes op ip inpRemaining
856+
let !br' = BufferRange (op `plusPtr` inpRemaining) ope
857+
k br'
858+
| otherwise = do
859+
copyBytes op ip outRemaining
860+
let !ip' = ip `plusPtr` outRemaining
861+
return $ bufferFull 1 ope (go ip')
862+
where
863+
outRemaining = ope `minusPtr` op
864+
inpRemaining = ipe `minusPtr` ip
865+
842866

843867
-- Strict ByteStrings
844868
------------------------------------------------------------------------------
@@ -858,6 +882,7 @@ byteStringThreshold :: Int -> S.StrictByteString -> Builder
858882
byteStringThreshold maxCopySize =
859883
\bs -> builder $ step bs
860884
where
885+
step :: forall r. S.ByteString -> BuildStep r -> BuildStep r
861886
step bs@(S.BS _ len) k br@(BufferRange !op _)
862887
| len <= maxCopySize = byteStringCopyStep bs k br
863888
| otherwise = return $ insertChunk op bs k
@@ -949,6 +974,87 @@ byteStringInsert :: S.StrictByteString -> Builder
949974
byteStringInsert =
950975
\bs -> builder $ \k (BufferRange op _) -> return $ insertChunk op bs k
951976

977+
978+
------------------------------------------------------------------------------
979+
-- Raw CString encoding
980+
------------------------------------------------------------------------------
981+
982+
-- | Builder for raw 'Addr#' pointers to null-terminated primitive ASCII
983+
-- strings that are free of embedded (overlong-encoded as the two-byte sequence
984+
-- @0xC0 0x80@) null characters.
985+
--
986+
-- @since 0.11.5.0
987+
{-# INLINABLE asciiLiteralCopy #-}
988+
asciiLiteralCopy :: Ptr Word8 -> Int -> Builder
989+
asciiLiteralCopy = \ !ip !len -> builder $ \k br@(BufferRange op ope) ->
990+
if len <= ope `minusPtr` op
991+
then copyBytes op ip len >> k (BufferRange (op `plusPtr` len) ope)
992+
else wrappedBufferRangeCopyStep (BufferRange ip (ip `plusPtr` len)) k br
993+
994+
-- | Builder for pointers to /null-terminated/ primitive UTF-8 encoded strings
995+
-- that may contain embedded overlong two-byte encodings of the NUL character
996+
-- as @0xC0 0x80@. Other deviations from strict UTF-8 are tolerated, but the
997+
-- result is not well defined.
998+
--
999+
-- @since 0.11.5.0
1000+
{-# INLINABLE modUtf8LitCopy #-}
1001+
modUtf8LitCopy :: Ptr Word8 -> Int -> Builder
1002+
modUtf8LitCopy !ip !len
1003+
| len > 0 = builder (modUtf8_step ip len)
1004+
| otherwise = builder id
1005+
1006+
-- | Copy a /non-empty/ UTF-8 input possibly containing denormalised 2-octet
1007+
-- sequences. While only the NUL byte should ever encoded that way (as @0xC0
1008+
-- 80@), this handles other denormalised @0xC0 0x??@ sequences by keeping the
1009+
-- bottom 6 bits of the second byte. If the input is non-UTF8 garbage, the the
1010+
-- result may not be what the user expected.
1011+
--
1012+
modUtf8_step :: Ptr Word8 -> Int -> BuildStep r -> BuildStep r
1013+
modUtf8_step !ip !len k (BufferRange op ope)
1014+
| op == ope = return $ bufferFull 1 op (modUtf8_step ip len k)
1015+
| otherwise = do
1016+
let !avail = ope `minusPtr` op
1017+
!usable = avail `min` len
1018+
-- null-termination makes it possible to read one more byte than the
1019+
-- nominal input length, with any unexpected 0xC000 ending interpreted
1020+
-- as a NUL. More typically, this simplifies hanlding of inputs where
1021+
-- 0xC0 0x80 might otherwise be split across the "usable" input window.
1022+
!ch <- peekElemOff ip (usable - 1)
1023+
let !use | ch /= 0xC0 = usable
1024+
| otherwise = usable + 1
1025+
!n <- utf8_copyBytes (ip `plusPtr` use) ip op
1026+
let !op' = op `plusPtr` n
1027+
!len' = len - use
1028+
ip' = ip `plusPtr` use
1029+
if | len' <= 0 -> k (BufferRange op' ope)
1030+
| op' < ope -> modUtf8_step ip' len' k (BufferRange op' ope)
1031+
| otherwise -> return $ bufferFull 1 op' (modUtf8_step ip' len' k)
1032+
1033+
-- | Consume the supplied input returning the number of bytes written
1034+
utf8_copyBytes :: Ptr Word8 -> Ptr Word8 -> Ptr Word8 -> IO Int
1035+
utf8_copyBytes !ipe = \ ip op -> go 0 ip op
1036+
where
1037+
go !n !ip@((< ipe) -> True) !op = do
1038+
!ch <- peek ip
1039+
let !ip' = ip `plusPtr` 1
1040+
!op' = op `plusPtr` 1
1041+
if | ch /= 0xC0 -> do
1042+
poke op ch
1043+
let !cnt = ipe `minusPtr` ip'
1044+
!runend <- S.memchr ip' 0xC0 (fromIntegral cnt)
1045+
let !runlen | runend == nullPtr = cnt
1046+
| otherwise = runend `minusPtr` ip'
1047+
if (runlen == 0)
1048+
then go (n + 1) ip' op'
1049+
else do
1050+
copyBytes op' ip' runlen
1051+
go (n + 1 + runlen) (ip' `plusPtr` runlen) (op' `plusPtr` runlen)
1052+
| otherwise -> do
1053+
!ch' <- peek ip'
1054+
poke op (ch' .&. 0x3f)
1055+
go (n + 1) (ip' `plusPtr` 1) op'
1056+
go !n _ _ = pure n
1057+
9521058
-- Short bytestrings
9531059
------------------------------------------------------------------------------
9541060

Data/ByteString/Builder/Prim.hs

Lines changed: 26 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
1+
{-# LANGUAGE CPP #-}
12
{-# LANGUAGE Trustworthy #-}
23

4+
#include "bytestring-cpp-macros.h"
5+
36
{- | Copyright : (c) 2010-2011 Simon Meier
47
(c) 2010 Jasper van der Jeugt
58
License : BSD3-style (see LICENSE)
@@ -464,9 +467,7 @@ import Data.ByteString.Builder.Prim.ASCII
464467

465468
import Foreign
466469
import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
467-
import GHC.Word (Word8 (..))
468470
import GHC.Exts
469-
import GHC.IO
470471

471472
------------------------------------------------------------------------------
472473
-- Creating Builders from bounded primitives
@@ -658,59 +659,36 @@ primMapLazyByteStringBounded w =
658659
L.foldrChunks (\x b -> primMapByteStringBounded w x `mappend` b) mempty
659660

660661

661-
------------------------------------------------------------------------------
662-
-- Raw CString encoding
663-
------------------------------------------------------------------------------
664-
665-
-- | A null-terminated ASCII encoded 'Foreign.C.String.CString'.
666-
-- Null characters are not representable.
662+
-- | Builder for raw 'Addr#' pointers to null-terminated primitive ASCII
663+
-- strings that are free of embedded null characters.
667664
--
668665
-- @since 0.11.0.0
669666
cstring :: Addr# -> Builder
670-
cstring =
671-
\addr0 -> builder $ step addr0
672-
where
673-
step :: Addr# -> BuildStep r -> BuildStep r
674-
step !addr !k br@(BufferRange op0@(Ptr op0#) ope)
675-
| W8# ch == 0 = k br
676-
| op0 == ope =
677-
return $ bufferFull 1 op0 (step addr k)
678-
| otherwise = do
679-
IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of
680-
s' -> (# s', () #)
681-
let br' = BufferRange (op0 `plusPtr` 1) ope
682-
step (addr `plusAddr#` 1#) k br'
683-
where
684-
!ch = indexWord8OffAddr# addr 0#
667+
cstring s = asciiLiteralCopy (Ptr s) (byteCountLiteral s)
668+
{-# INLINE cstring #-}
685669

686-
-- | A null-terminated UTF-8 encoded 'Foreign.C.String.CString'.
687-
-- Null characters can be encoded as @0xc0 0x80@.
670+
-- | Builder for raw 'Addr#' pointers to null-terminated primitive UTF-8
671+
-- encoded strings in which any emebded null characters are represented via
672+
-- the two-byte overlong-encoding: @0xC0 0x80@.
688673
--
689674
-- @since 0.11.0.0
690675
cstringUtf8 :: Addr# -> Builder
691-
cstringUtf8 =
692-
\addr0 -> builder $ step addr0
693-
where
694-
step :: Addr# -> BuildStep r -> BuildStep r
695-
step !addr !k br@(BufferRange op0@(Ptr op0#) ope)
696-
| W8# ch == 0 = k br
697-
| op0 == ope =
698-
return $ bufferFull 1 op0 (step addr k)
699-
-- NULL is encoded as 0xc0 0x80
700-
| W8# ch == 0xc0
701-
, W8# (indexWord8OffAddr# addr 1#) == 0x80 = do
702-
let !(W8# nullByte#) = 0
703-
IO $ \s -> case writeWord8OffAddr# op0# 0# nullByte# s of
704-
s' -> (# s', () #)
705-
let br' = BufferRange (op0 `plusPtr` 1) ope
706-
step (addr `plusAddr#` 2#) k br'
707-
| otherwise = do
708-
IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of
709-
s' -> (# s', () #)
710-
let br' = BufferRange (op0 `plusPtr` 1) ope
711-
step (addr `plusAddr#` 1#) k br'
712-
where
713-
!ch = indexWord8OffAddr# addr 0#
676+
cstringUtf8 s = modUtf8LitCopy (Ptr s) (byteCountLiteral s)
677+
{-# INLINE cstringUtf8 #-}
678+
679+
-- | Byte count of null-terminated primitive literal string excluding the
680+
-- terminating null byte.
681+
byteCountLiteral :: Addr# -> Int
682+
byteCountLiteral addr# =
683+
#if HS_cstringLength_AND_FinalPtr_AVAILABLE
684+
I# (cstringLength# addr#)
685+
#else
686+
fromIntegral (pure_strlen (Ptr addr#))
687+
688+
foreign import ccall unsafe "string.h strlen" pure_strlen
689+
:: CString -> CSize
690+
#endif
691+
{-# INLINE byteCountLiteral #-}
714692

715693
------------------------------------------------------------------------------
716694
-- Char8 encoding

bench/BenchAll.hs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,12 @@ main = do
327327
, benchB'_ "ASCII String (12B)" $ asciiLit (Ptr "hello wurld!"#)
328328
, benchB' "ASCII String (64B, naive)" asciiStr fromString
329329
, benchB'_ "ASCII String (64B)" $ asciiLit asciiBuf
330+
, benchB'_ "AsciiLit" $ asciiLit asciiBuf
331+
, benchB'_ "Utf8Lit" $ utf8Lit utf8Buf
332+
, benchB'_ "strLit" $ string8 asciiStr
333+
, benchB'_ "stringUtf8" $ stringUtf8 utf8Str
334+
, benchB'_ "strLitInline" $ string8 "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
335+
, benchB'_ "utf8LitInline" $ stringUtf8 "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\0XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
330336
]
331337

332338
, bgroup "Encoding wrappers"

tests/builder/Data/ByteString/Builder/Prim/Tests.hs

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import Data.Char (ord)
1414
import qualified Data.ByteString.Lazy as L
1515
import qualified Data.ByteString.Lazy.Char8 as LC
1616
import Data.ByteString.Builder
17+
import Data.ByteString.Builder.Extra as BE
1718
import qualified Data.ByteString.Builder.Prim as BP
1819
import Data.ByteString.Builder.Prim.TestUtils
1920

@@ -22,17 +23,28 @@ import Test.Tasty.QuickCheck
2223

2324
tests :: [TestTree]
2425
tests = concat [ testsBinary, testsASCII, testsChar8, testsUtf8
25-
, testsCombinatorsB, [testCString, testCStringUtf8] ]
26+
, testsCombinatorsB
27+
, [ testCString
28+
, testCStringUtf8 1
29+
, testCStringUtf8 6
30+
, testCStringUtf8 64
31+
]
32+
]
2633

2734
testCString :: TestTree
2835
testCString = testProperty "cstring" $
2936
toLazyByteString (BP.cstring "hello world!"#) ==
3037
LC.pack "hello" `L.append` L.singleton 0x20 `L.append` LC.pack "world!"
3138

32-
testCStringUtf8 :: TestTree
33-
testCStringUtf8 = testProperty "cstringUtf8" $
34-
toLazyByteString (BP.cstringUtf8 "hello\xc0\x80world!"#) ==
35-
LC.pack "hello" `L.append` L.singleton 0x00 `L.append` LC.pack "world!"
39+
testCStringUtf8 :: Int -> TestTree
40+
testCStringUtf8 sz = testProperty "cstringUtf8" $
41+
BE.toLazyByteStringWith (BE.untrimmedStrategy sz sz) L.empty
42+
(BP.cstringUtf8 "hello\xc0\x80\xc0\x80\xd0\xbc\xd0\xb8\xd1\x80\xc0\x80\xC0"#) ==
43+
LC.pack "hello" `L.append` L.singleton 0x00
44+
`L.append` L.singleton 0x00
45+
`L.append` LC.pack "\xd0\xbc\xd0\xb8\xd1\x80"
46+
`L.append` L.singleton 0x00
47+
`L.append` L.singleton 0x00
3648

3749
------------------------------------------------------------------------------
3850
-- Binary

0 commit comments

Comments
 (0)