Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 23 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ jobs:
apt-get install -y ghc libghc-tasty-quickcheck-dev libghc-syb-dev
run: |
ghc --version
ghc --make -fPIC -XHaskell2010 -XBangPatterns -XDeriveDataTypeable -XDeriveGeneric -XDeriveLift -XFlexibleContexts -XFlexibleInstances -XLambdaCase -XMagicHash -XMultiWayIf -XNamedFieldPuns -XPatternSynonyms -XRankNTypes -XScopedTypeVariables -XStandaloneDeriving -XTupleSections -XTypeApplications -XTypeOperators -XUnboxedTuples -optP-Wall -optP-Werror=undef -DPURE_HASKELL=0 -Iinclude -itests:tests/builder -o Main cbits/*.c tests/Main.hs +RTS -s
ghc --make -fPIC -XHaskell2010 -XBangPatterns -XDeriveDataTypeable -XDeriveGeneric -XDeriveLift -XFlexibleContexts -XFlexibleInstances -XLambdaCase -XMagicHash -XMultiWayIf -XNamedFieldPuns -XPatternSynonyms -XRankNTypes -XScopedTypeVariables -XStandaloneDeriving -XTupleSections -XTypeApplications -XTypeOperators -XUnboxedTuples -optP-Wall -optP-Werror=undef -DPURE_HASKELL=0 -DBYTESTRING_PLUGIN_TESTS=0 -Iinclude -itests:tests/builder -o Main cbits/*.c tests/Main.hs +RTS -s
./Main +RTS -s

bounds-checking:
Expand All @@ -158,7 +158,7 @@ jobs:
path: |
${{ steps.setup-haskell-cabal.outputs.cabal-store }}
dist-newstyle
key: ${{ runner.os }}-latest
key: ${{ runner.os }}-latest-bounds-checking
- name: Test
run: cabal test --ghc-options='-fcheck-prim-bounds -fno-ignore-asserts -DHS_BYTESTRING_ASSERTIONS'

Expand All @@ -183,6 +183,27 @@ jobs:
- name: Test
run: cabal test -fpure-haskell --ghc-options=-fno-ignore-asserts --enable-tests --test-show-details=direct all

inspection-testing:
needs: build
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: haskell-actions/setup@v2
id: setup-haskell-cabal
with:
ghc-version: 'latest'
- name: Update cabal package database
run: cabal update
- uses: actions/cache@v3
name: Cache cabal stuff
with:
path: |
${{ steps.setup-haskell-cabal.outputs.cabal-store }}
dist-newstyle
key: ${{ runner.os }}-latest-inspection-testing
- name: Test
run: sh run-plugin-tests.sh

i386:
needs: build
runs-on: ubuntu-latest
Expand Down
107 changes: 107 additions & 0 deletions Data/ByteString/Builder/Internal.hs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
{-# LANGUAGE Unsafe #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE NoMonoLocalBinds #-}
{-# LANGUAGE ViewPatterns #-}

{-# OPTIONS_HADDOCK not-home #-}

Expand Down Expand Up @@ -87,6 +89,8 @@ module Data.ByteString.Builder.Internal (
-- , sizedChunksInsert

, byteStringCopy
, asciiLiteralCopy
, modUtf8LitCopy
, byteStringInsert
, byteStringThreshold

Expand Down Expand Up @@ -816,6 +820,7 @@ ensureFree :: Int -> Builder
ensureFree minFree =
builder step
where
step :: forall r. BuildStep r -> BuildStep r
step k br@(BufferRange op ope)
| ope `minusPtr` op < minFree = return $ bufferFull minFree op k
| otherwise = k br
Expand All @@ -839,6 +844,25 @@ wrappedBytesCopyStep bs0 k =
where
outRemaining = ope `minusPtr` op

-- | Copy the bytes from a 'BufferRange' into the output stream.
wrappedBufferRangeCopyStep :: BufferRange -- ^ Input 'BufferRange'.
-> BuildStep a -> BuildStep a
wrappedBufferRangeCopyStep (BufferRange ip0 ipe) k =
go ip0
where
go !ip (BufferRange op ope)
| inpRemaining <= outRemaining = do
copyBytes op ip inpRemaining
let !br' = BufferRange (op `plusPtr` inpRemaining) ope
k br'
| otherwise = do
copyBytes op ip outRemaining
let !ip' = ip `plusPtr` outRemaining
return $ bufferFull 1 ope (go ip')
where
outRemaining = ope `minusPtr` op
inpRemaining = ipe `minusPtr` ip


-- Strict ByteStrings
------------------------------------------------------------------------------
Expand All @@ -858,6 +882,7 @@ byteStringThreshold :: Int -> S.StrictByteString -> Builder
byteStringThreshold maxCopySize =
\bs -> builder $ step bs
where
step :: forall r. S.ByteString -> BuildStep r -> BuildStep r
step bs@(S.BS _ len) k br@(BufferRange !op _)
| len <= maxCopySize = byteStringCopyStep bs k br
| otherwise = return $ insertChunk op bs k
Expand Down Expand Up @@ -949,6 +974,88 @@ byteStringInsert :: S.StrictByteString -> Builder
byteStringInsert =
\bs -> builder $ \k (BufferRange op _) -> return $ insertChunk op bs k


------------------------------------------------------------------------------
-- Raw CString encoding
------------------------------------------------------------------------------

-- | Builder for raw pointers to static data of known length that will never be
-- moved or freed. (This is used with the static buffers GHC uses to implement
-- ASCII string literals that do not contain null characters.)
--
-- @since 0.13.0.0
{-# INLINABLE asciiLiteralCopy #-}
asciiLiteralCopy :: Ptr Word8 -> Int -> Builder
asciiLiteralCopy = \ !ip !len -> builder $ \k br@(BufferRange op ope) ->
if len <= ope `minusPtr` op
then copyBytes op ip len >> k (BufferRange (op `plusPtr` len) ope)
else wrappedBufferRangeCopyStep (BufferRange ip (ip `plusPtr` len)) k br

-- | Builder for pointers to /null-terminated/ primitive UTF-8 encoded strings
-- that may contain embedded overlong two-byte encodings of the NUL character
-- as @0xC0 0x80@. Other deviations from strict UTF-8 are tolerated, but the
-- result is not well defined.
--
-- @since 0.13.0.0
{-# INLINABLE modUtf8LitCopy #-}
modUtf8LitCopy :: Ptr Word8 -> Int -> Builder
modUtf8LitCopy !ip !len
| len > 0 = builder (modUtf8_step ip len)
| otherwise = builder id

-- | Copy a /non-empty/ UTF-8 input possibly containing denormalised 2-octet
-- sequences. While only the NUL byte should ever encoded that way (as @0xC0
-- 80@), this handles other denormalised @0xC0 0x??@ sequences by keeping the
-- bottom 6 bits of the second byte. If the input is non-UTF8 garbage, the the
-- result may not be what the user expected.
--
modUtf8_step :: Ptr Word8 -> Int -> BuildStep r -> BuildStep r
modUtf8_step !ip !len k (BufferRange op ope)
| op == ope = return $ bufferFull 1 op (modUtf8_step ip len k)
| otherwise = do
let !avail = ope `minusPtr` op
!usable = avail `min` len
-- null-termination makes it possible to read one more byte than the
-- nominal input length, with any unexpected 0xC000 ending interpreted
-- as a NUL. More typically, this simplifies hanlding of inputs where
-- 0xC0 0x80 might otherwise be split across the "usable" input window.
!ch <- peekElemOff ip (usable - 1)
let !use | ch /= 0xC0 = usable
| otherwise = usable + 1
!n <- utf8_copyBytes (ip `plusPtr` use) ip op
let !op' = op `plusPtr` n
!len' = len - use
ip' = ip `plusPtr` use
if | len' <= 0 -> k (BufferRange op' ope)
| op' < ope -> modUtf8_step ip' len' k (BufferRange op' ope)
| otherwise -> return $ bufferFull 1 op' (modUtf8_step ip' len' k)

-- | Consume the supplied input returning the number of bytes written
utf8_copyBytes :: Ptr Word8 -> Ptr Word8 -> Ptr Word8 -> IO Int
utf8_copyBytes !ipe = \ ip op -> go 0 ip op
where
go :: Int -> Ptr Word8 -> Ptr Word8 -> IO Int
go !n !ip@((< ipe) -> True) !op = do
!ch <- peek ip
let !ip' = ip `plusPtr` 1
!op' = op `plusPtr` 1
if | ch /= 0xC0 -> do
poke op ch
let !cnt = ipe `minusPtr` ip'
!runend <- S.memchr ip' 0xC0 (fromIntegral @Int cnt)
let !runlen | runend == nullPtr = cnt
| otherwise = runend `minusPtr` ip'
if (runlen == 0)
then go (n + 1) ip' op'
else do
copyBytes op' ip' runlen
go (n + 1 + runlen) (ip' `plusPtr` runlen) (op' `plusPtr` runlen)
| otherwise -> do
!ch' <- peek ip'
poke op (ch' .&. 0x3f)
go (n + 1) (ip' `plusPtr` 1) op'
go !n _ _ = pure n

-- Short bytestrings
------------------------------------------------------------------------------

Expand Down
58 changes: 10 additions & 48 deletions Data/ByteString/Builder/Prim.hs
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,7 @@ import Data.ByteString.Builder.Internal

import qualified Data.ByteString as S
import qualified Data.ByteString.Internal as S
import qualified Data.ByteString.Internal.Type as S
import qualified Data.ByteString.Lazy.Internal as L

import Data.Char (ord)
Expand All @@ -464,9 +465,7 @@ import Data.ByteString.Builder.Prim.ASCII

import Foreign
import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
import GHC.Word (Word8 (..))
import GHC.Exts
import GHC.IO

------------------------------------------------------------------------------
-- Creating Builders from bounded primitives
Expand Down Expand Up @@ -658,59 +657,22 @@ primMapLazyByteStringBounded w =
L.foldrChunks (\x b -> primMapByteStringBounded w x `mappend` b) mempty


------------------------------------------------------------------------------
-- Raw CString encoding
------------------------------------------------------------------------------

-- | A null-terminated ASCII encoded 'Foreign.C.String.CString'.
-- Null characters are not representable.
-- | Builder for raw 'Addr#' pointers to null-terminated primitive ASCII
-- strings that are free of embedded null characters.
--
-- @since 0.11.0.0
cstring :: Addr# -> Builder
cstring =
\addr0 -> builder $ step addr0
where
step :: Addr# -> BuildStep r -> BuildStep r
step !addr !k br@(BufferRange op0@(Ptr op0#) ope)
| W8# ch == 0 = k br
| op0 == ope =
return $ bufferFull 1 op0 (step addr k)
| otherwise = do
IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of
s' -> (# s', () #)
let br' = BufferRange (op0 `plusPtr` 1) ope
step (addr `plusAddr#` 1#) k br'
where
!ch = indexWord8OffAddr# addr 0#
cstring s = asciiLiteralCopy (Ptr s) (S.byteCountLiteral s)
{-# INLINE cstring #-}

-- | A null-terminated UTF-8 encoded 'Foreign.C.String.CString'.
-- Null characters can be encoded as @0xc0 0x80@.
-- | Builder for raw 'Addr#' pointers to null-terminated primitive UTF-8
-- encoded strings in which any emebded null characters are represented via
-- the two-byte overlong-encoding: @0xC0 0x80@.
--
-- @since 0.11.0.0
cstringUtf8 :: Addr# -> Builder
cstringUtf8 =
\addr0 -> builder $ step addr0
where
step :: Addr# -> BuildStep r -> BuildStep r
step !addr !k br@(BufferRange op0@(Ptr op0#) ope)
| W8# ch == 0 = k br
| op0 == ope =
return $ bufferFull 1 op0 (step addr k)
-- NULL is encoded as 0xc0 0x80
| W8# ch == 0xc0
, W8# (indexWord8OffAddr# addr 1#) == 0x80 = do
let !(W8# nullByte#) = 0
IO $ \s -> case writeWord8OffAddr# op0# 0# nullByte# s of
s' -> (# s', () #)
let br' = BufferRange (op0 `plusPtr` 1) ope
step (addr `plusAddr#` 2#) k br'
| otherwise = do
IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of
s' -> (# s', () #)
let br' = BufferRange (op0 `plusPtr` 1) ope
step (addr `plusAddr#` 1#) k br'
where
!ch = indexWord8OffAddr# addr 0#
cstringUtf8 s = modUtf8LitCopy (Ptr s) (S.byteCountLiteral s)
{-# INLINE cstringUtf8 #-}

------------------------------------------------------------------------------
-- Char8 encoding
Expand Down
13 changes: 13 additions & 0 deletions Data/ByteString/Internal/Type.hs
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ module Data.ByteString.Internal.Type (
unsafePackAddress, unsafePackLenAddress,
unsafePackLiteral, unsafePackLenLiteral,
literalFromOctetString, literalFromHex,
byteCountLiteral,

-- * Low level imperative construction
empty,
Expand Down Expand Up @@ -486,6 +487,18 @@ unsafePackLenAddress len addr# = do
#endif
{-# INLINE unsafePackLenAddress #-}

-- | Byte count of null-terminated primitive literal string excluding the
-- terminating null byte.
byteCountLiteral :: Addr# -> Int
byteCountLiteral addr# =
#if HS_cstringLength_AND_FinalPtr_AVAILABLE
I# (cstringLength# addr#)
#else
fromIntegral @CSize @Int $
accursedUnutterablePerformIO (c_strlen (Ptr addr#))
#endif
{-# INLINE byteCountLiteral #-}

-- | See 'unsafePackAddress'. This function has similar behavior. Prefer
-- this function when the address in known to be an @Addr#@ literal. In
-- that context, there is no need for the sequencing guarantees that 'IO'
Expand Down
4 changes: 4 additions & 0 deletions bench/BenchAll.hs
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,10 @@ main = do
, benchB'_ "ASCII String (12B)" $ asciiLit (Ptr "hello wurld!"#)
, benchB' "ASCII String (64B, naive)" asciiStr fromString
, benchB'_ "ASCII String (64B)" $ asciiLit asciiBuf
, benchB'_ "strLit" $ string8 asciiStr
, benchB'_ "stringUtf8" $ stringUtf8 utf8Str
, benchB'_ "strLitInline" $ string8 "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
, benchB'_ "utf8LitInline" $ stringUtf8 "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\0XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
]

, bgroup "Encoding wrappers"
Expand Down
19 changes: 17 additions & 2 deletions bytestring.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -216,8 +216,10 @@ test-suite bytestring-tests
QuickCheckUtils
hs-source-dirs: tests,
tests/builder
build-depends: bytestring
-- Keep 'bytestring' on the same line as 'build-depends:'
-- this is used by our hack to allow plugin-based tests
build-depends: base,
bytestring,
deepseq,
QuickCheck,
tasty,
Expand All @@ -226,6 +228,17 @@ test-suite bytestring-tests
transformers >= 0.3,
syb

-- The following intentionally-funnily-spelled condition
-- is changed to 'true' by our hack to allow plugin-based tests
if false && impl(pluginTestsHack)
cpp-options: -DBYTESTRING_PLUGIN_TESTS=1
build-depends: tasty-inspection-testing ^>= 0.2.1,
tasty-expected-failure ^>= 0.12.3
other-modules: PluginTests
PluginTests.Splices
else
cpp-options: -DBYTESTRING_PLUGIN_TESTS=0

ghc-options: -fwarn-unused-binds
-rtsopts
if !arch(wasm32)
Expand All @@ -249,8 +262,10 @@ benchmark bytestring-bench

ghc-options: -O2 "-with-rtsopts=-A32m"
-fproc-alignment=64
build-depends: bytestring
-- Keep 'bytestring' on the same line as 'build-depends:'
-- this is used by our hack to allow plugin-based tests
build-depends: base,
bytestring,
deepseq,
tasty-bench,
random
14 changes: 14 additions & 0 deletions run-plugin-tests.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# small script to hackily work around the dependency cycle
# 'bytestring -> [plugin] -> ghc -> bytestring' that prevents
# the testsuite from using plugins, by renaming the library
# to 'bytestring-plugins-hack'

sed -E '
/Name:|build-depends:/s/bytestring/bytestring-plugins-hack/ ;
s/if false && impl\(pluginTestsHack\)/if true/' \
bytestring.cabal > bytestring-plugins-hack.cabal

mv bytestring.cabal bytestring.cabal.__MOVED_DURING_PLUGIN_TESTS__
cabal test --test-show-details=direct "$@"
mv bytestring.cabal.__MOVED_DURING_PLUGIN_TESTS__ bytestring.cabal
rm bytestring-plugins-hack.cabal
Loading