Skip to content

Commit

Permalink
Data.ByteString.Lazy.dropEnd: Use two-pointers technique (#629)
Browse files Browse the repository at this point in the history
* Data.ByteString.Lazy.dropEnd: Use two-pointers technique

This can be seen as using the input itself as an implicit
queue; we formerly copied its chunks into an explicit queue.

By writing the key logic as a polymorphic `splitAtEndFold`,
it was easy to re-use it for `takeEnd`; the latter function
should now operate in constant stack space and can release
initial chunks of a very long input string sooner.

* Fix a very silly bug, and strengthen tests

...so that a plain 'cabal test' finds the bug almost every try
instead of finding it only every few dozen tries

* Actually work around the poison instance

(Some re-compilation check somewhere set a trap for me.)

This also replaces fromIntegral with intToIndexTy in a few places.

* Rewrite the poison instance using TypeError

* Rename "bsL" -> "toSplit" and "bsR" -> "toScan"

* Add basic benchmarks for lazy takeEnd/splitEnd

According to these benchmarks, the new implementation for takeEnd
is somewhat faster and the new implementation for dropEnd is roughly
3.5x to 4x as quick as its predecessor.

(cherry picked from commit 2bbc97e)
  • Loading branch information
clyring committed Feb 5, 2024
1 parent 62d1835 commit ed7a9d4
Show file tree
Hide file tree
Showing 7 changed files with 156 additions and 133 deletions.
129 changes: 80 additions & 49 deletions Data/ByteString/Lazy.hs
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
{-# LANGUAGE BangPatterns #-}
{-# OPTIONS_GHC -fno-warn-incomplete-patterns #-}
{-# OPTIONS_HADDOCK prune #-}
{-# LANGUAGE Trustworthy #-}

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE ScopedTypeVariables #-}

-- |
-- Module : Data.ByteString.Lazy
-- Copyright : (c) Don Stewart 2006
Expand Down Expand Up @@ -237,9 +240,9 @@ import qualified Data.ByteString as P (ByteString) -- type name only
import qualified Data.ByteString as S -- S for strict (hmm...)
import qualified Data.ByteString.Internal.Type as S
import qualified Data.ByteString.Unsafe as S
import qualified Data.ByteString.Lazy.Internal.Deque as D
import Data.ByteString.Lazy.Internal

import Control.Exception (assert)
import Control.Monad (mplus)
import Data.Word (Word8)
import Data.Int (Int64)
Expand Down Expand Up @@ -790,15 +793,75 @@ take i cs0 = take' i cs0
--
-- @since 0.11.2.0
takeEnd :: Int64 -> ByteString -> ByteString
takeEnd i _ | i <= 0 = Empty
takeEnd i cs0 = takeEnd' i cs0
where takeEnd' 0 _ = Empty
takeEnd' n cs =
snd $ foldrChunks takeTuple (n,Empty) cs
takeTuple _ (0, cs) = (0, cs)
takeTuple c (n, cs)
| n > fromIntegral (S.length c) = (n - fromIntegral (S.length c), Chunk c cs)
| otherwise = (0, Chunk (S.takeEnd (fromIntegral n) c) cs)
takeEnd i bs
| i <= 0 = Empty
| otherwise = splitAtEndFold (\_ res -> res) id i bs

-- | Helper function for implementing 'takeEnd' and 'dropEnd'
splitAtEndFold
:: forall result
. (S.StrictByteString -> result -> result)
-- ^ What to do when one chunk of output is ready
-- (The StrictByteString will not be empty.)
-> (ByteString -> result)
-- ^ What to do when the split-point is reached
-> Int64
-- ^ Number of bytes to leave at the end (must be strictly positive)
-> ByteString -- ^ Input ByteString
-> result
{-# INLINE splitAtEndFold #-}
splitAtEndFold step end len bs0 = assert (len > 0) $ case bs0 of
Empty -> end Empty
Chunk c t -> goR len c t t
where
-- Idea: Keep two references into the input ByteString:
-- "toSplit" tracks the current split point,
-- "toScan" tracks the yet-unprocessed tail.
-- When they are closer than "len" bytes apart, process more input. ("goR")
-- When they are at least "len" bytes apart, produce more output. ("goL")
-- We always have that "toScan" is a suffix of "toSplit",
-- and "toSplit" is a suffix of the original input (bs0).
goR :: Int64 -> S.StrictByteString -> ByteString -> ByteString -> result
goR !undershoot nextOutput@(S.BS noFp noLen) toSplit toScan =
assert (undershoot > 0) $
-- INVARIANT: length toSplit == length toScan + len - undershoot
-- (not 'assert'ed because that would break our laziness properties)
case toScan of
Empty
| undershoot >= intToInt64 noLen
-> end (Chunk nextOutput toSplit)
| undershootW <- fromIntegral @Int64 @Int undershoot
-- conversion Int64->Int is OK because 0 < undershoot < noLen
, splitIndex <- noLen - undershootW
, beforeSplit <- S.BS noFp splitIndex
, afterSplit <- S.BS (noFp `S.plusForeignPtr` splitIndex) undershootW
-> step beforeSplit $ end (Chunk afterSplit toSplit)

Chunk (S.BS _ cLen) newBsR
| cLen64 <- intToInt64 cLen
, undershoot > cLen64
-> goR (undershoot - cLen64) nextOutput toSplit newBsR
| undershootW <- fromIntegral @Int64 @Int undershoot
-> step nextOutput $ goL (cLen - undershootW) toSplit newBsR

goL :: Int -> ByteString -> ByteString -> result
goL !overshoot toSplit toScan =
assert (overshoot >= 0) $
-- INVARIANT: length toSplit == length toScan + len + intToInt64 overshoot
-- (not 'assert'ed because that would break our laziness properties)
case toSplit of
Empty -> splitAtEndFoldInvariantFailed
Chunk c@(S.BS _ cLen) newBsL
| overshoot >= cLen
-> step c $ goL (overshoot - cLen) newBsL toScan
| otherwise
-> goR (intToInt64 $ cLen - overshoot) c newBsL toScan

splitAtEndFoldInvariantFailed :: a
-- See Note [Float error calls out of INLINABLE things] in D.B.Internal.Type
splitAtEndFoldInvariantFailed =
moduleError "splitAtEndFold"
"internal error: toSplit not longer than toScan"

-- | /O(n\/c)/ 'drop' @n xs@ returns the suffix of @xs@ after the first @n@
-- elements, or 'empty' if @n > 'length' xs@.
Expand All @@ -824,44 +887,9 @@ drop i cs0 = drop' i cs0
--
-- @since 0.11.2.0
dropEnd :: Int64 -> ByteString -> ByteString
dropEnd i p | i <= 0 = p
dropEnd i p = go D.empty p
where go :: D.Deque -> ByteString -> ByteString
go deque (Chunk c cs)
| D.byteLength deque < i = go (D.snoc c deque) cs
| otherwise =
let (output, deque') = getOutput empty (D.snoc c deque)
in foldrChunks Chunk (go deque' cs) output
go deque Empty = fromDeque $ dropEndBytes deque i

len c = fromIntegral (S.length c)

-- get a `ByteString` from all the front chunks of the accumulating deque
-- for which we know they won't be dropped
getOutput :: ByteString -> D.Deque -> (ByteString, D.Deque)
getOutput out deque = case D.popFront deque of
Nothing -> (reverseChunks out, deque)
Just (x, deque') | D.byteLength deque' >= i ->
getOutput (Chunk x out) deque'
_ -> (reverseChunks out, deque)

-- reverse a `ByteString`s chunks, keeping all internal `S.StrictByteString`s
-- unchanged
reverseChunks = foldlChunks (flip Chunk) empty

-- drop n elements from the rear of the accumulating `deque`
dropEndBytes :: D.Deque -> Int64 -> D.Deque
dropEndBytes deque n = case D.popRear deque of
Nothing -> deque
Just (deque', x) | len x <= n -> dropEndBytes deque' (n - len x)
| otherwise ->
D.snoc (S.dropEnd (fromIntegral n) x) deque'

-- build a lazy ByteString from an accumulating `deque`
fromDeque :: D.Deque -> ByteString
fromDeque deque =
List.foldr chunk Empty (D.front deque) `append`
List.foldl' (flip chunk) Empty (D.rear deque)
dropEnd i p
| i <= 0 = p
| otherwise = splitAtEndFold Chunk (const Empty) i p

-- | /O(n\/c)/ 'splitAt' @n xs@ is equivalent to @('take' n xs, 'drop' n xs)@.
splitAt :: Int64 -> ByteString -> (ByteString, ByteString)
Expand Down Expand Up @@ -1688,6 +1716,9 @@ revNonEmptyChunks = List.foldl' (flip Chunk) Empty
revChunks :: [P.ByteString] -> ByteString
revChunks = List.foldl' (flip chunk) Empty

intToInt64 :: Int -> Int64
intToInt64 = fromIntegral @Int @Int64

-- $IOChunk
--
-- ⚠ Using lazy I\/O functions like 'readFile' or 'hGetContents'
Expand Down
65 changes: 0 additions & 65 deletions Data/ByteString/Lazy/Internal/Deque.hs

This file was deleted.

24 changes: 21 additions & 3 deletions bench/BenchAll.hs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ import Data.Semigroup
import Data.String
import Test.Tasty.Bench
import Prelude hiding (words)
import qualified Data.List as List
import Control.DeepSeq

import qualified Data.ByteString as S
import qualified Data.ByteString.Char8 as S8
Expand Down Expand Up @@ -99,9 +101,12 @@ lazyByteStringData = case S.splitAt (nRepl `div` 2) byteStringData of

{-# NOINLINE smallChunksData #-}
smallChunksData :: L.ByteString
smallChunksData
= L.fromChunks [S.take sz (S.drop n byteStringData)
| let sz = 48, n <- [0, sz .. S.length byteStringData]]
smallChunksData = L.fromChunks $ List.unfoldr step (byteStringData, 1)
where
step (!s, !i)
| S.null s = Nothing
| otherwise = case S.splitAt i s of
(!s1, !s2) -> Just (s1, (s2, i * 71 `mod` 97))

{-# NOINLINE byteStringChunksData #-}
byteStringChunksData :: [S.ByteString]
Expand Down Expand Up @@ -419,6 +424,19 @@ main = do
[ bench "strict" $ nf S.tails byteStringData
, bench "lazy" $ nf L.tails lazyByteStringData
]
, bgroup "splitAtEnd (lazy)" $ let
testSAE op = \bs -> [op i bs | i <- [0,5..L.length bs]] `deepseq` ()
{-# INLINE testSAE #-}
in
[ bench "takeEnd" $
nf (testSAE L.takeEnd) lazyByteStringData
, bench "takeEnd (small chunks)" $
nf (testSAE L.takeEnd) smallChunksData
, bench "dropEnd" $
nf (testSAE L.dropEnd) lazyByteStringData
, bench "dropEnd (small chunks)" $
nf (testSAE L.dropEnd) smallChunksData
]
, bgroup "sort" $ map (\s -> bench (S8.unpack s) $ nf S.sort s) sortInputs
, bgroup "stimes" $ let st = stimes :: Int -> S.ByteString -> S.ByteString
in
Expand Down
1 change: 0 additions & 1 deletion bytestring.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,6 @@ library
Data.ByteString.Builder.RealFloat.Internal
Data.ByteString.Builder.RealFloat.TableGenerator
Data.ByteString.Internal.Type
Data.ByteString.Lazy.Internal.Deque
Data.ByteString.Lazy.ReadInt
Data.ByteString.Lazy.ReadNat
Data.ByteString.ReadInt
Expand Down
38 changes: 26 additions & 12 deletions tests/Properties/ByteString.hs
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ import Test.Tasty
import Test.Tasty.QuickCheck
import QuickCheckUtils

#ifdef BYTESTRING_LAZY
import Data.Int
#endif

#ifndef BYTESTRING_CHAR8
toElem :: Word8 -> Word8
toElem = id
Expand Down Expand Up @@ -114,16 +118,25 @@ instance Arbitrary Natural where

testRdInt :: forall a. (Arbitrary a, RdInt a) => String -> TestTree
testRdInt s = testGroup s $
[ testProperty "from string" $ \ prefix value suffix ->
[ testProperty "from string" $ int64OK $ \value prefix suffix ->
let si = show @a value
b = prefix <> B.pack si <> suffix
in fmap (second B.unpack) (bread @a b)
=== sread @a (B.unpack prefix ++ si ++ B.unpack suffix)
, testProperty "from number" $ \n ->
, testProperty "from number" $ int64OK $ \n ->
bread @a (B.pack (show n)) === Just (n, B.empty)
]
#endif

intToIndexTy :: Int -> IndexTy
#ifdef BYTESTRING_LAZY
type IndexTy = Int64
intToIndexTy = fromIntegral @Int @Int64
#else
type IndexTy = Int
intToIndexTy = id
#endif

tests :: [TestTree]
tests =
[ testProperty "pack . unpack" $
Expand Down Expand Up @@ -308,7 +321,7 @@ tests =
#endif

, testProperty "drop" $
\n x -> B.unpack (B.drop n x) === List.genericDrop n (B.unpack x)
\(intToIndexTy -> n) x -> B.unpack (B.drop n x) === List.genericDrop n (B.unpack x)
, testProperty "drop 10" $
\x -> let n = 10 in B.unpack (B.drop n x) === List.genericDrop n (B.unpack x)
, testProperty "drop 2^31" $
Expand All @@ -325,7 +338,7 @@ tests =
#endif

, testProperty "take" $
\n x -> B.unpack (B.take n x) === List.genericTake n (B.unpack x)
\(intToIndexTy -> n) x -> B.unpack (B.take n x) === List.genericTake n (B.unpack x)
, testProperty "take 10" $
\x -> let n = 10 in B.unpack (B.take n x) === List.genericTake n (B.unpack x)
, testProperty "take 2^31" $
Expand All @@ -342,11 +355,11 @@ tests =
#endif

, testProperty "dropEnd" $
\n x -> B.dropEnd n x === B.take (B.length x - n) x
\(intToIndexTy -> n) x -> B.dropEnd n x === B.take (B.length x - n) x
, testProperty "dropWhileEnd" $
\f x -> B.dropWhileEnd f x === B.reverse (B.dropWhile f (B.reverse x))
, testProperty "takeEnd" $
\n x -> B.takeEnd n x === B.drop (B.length x - n) x
\(intToIndexTy -> n) x -> B.takeEnd n x === B.drop (B.length x - n) x
, testProperty "takeWhileEnd" $
\f x -> B.takeWhileEnd f x === B.reverse (B.takeWhile f (B.reverse x))

Expand All @@ -366,7 +379,7 @@ tests =
, testProperty "compareLength 4" $
\x (toElem -> c) -> B.compareLength (B.snoc x c <> undefined) (B.length x) === GT
, testProperty "compareLength 5" $
\x n -> B.compareLength x n === compare (B.length x) n
\x (intToIndexTy -> n) -> B.compareLength x n === compare (B.length x) n
, testProperty "dropEnd lazy" $
\(toElem -> c) -> B.take 1 (B.dropEnd 1 (B.singleton c <> B.singleton c <> B.singleton c <> undefined)) === B.singleton c
, testProperty "dropWhileEnd lazy" $
Expand Down Expand Up @@ -470,7 +483,8 @@ tests =
(l1 == l2 || l1 == l2 + 1) && sum (map B.length splits) + l2 == B.length x

, testProperty "splitAt" $
\n x -> (B.unpack *** B.unpack) (B.splitAt n x) === List.genericSplitAt n (B.unpack x)
\(intToIndexTy -> n) x -> (B.unpack *** B.unpack) (B.splitAt n x)
=== List.genericSplitAt n (B.unpack x)
, testProperty "splitAt 10" $
\x -> let n = 10 in (B.unpack *** B.unpack) (B.splitAt n x) === List.genericSplitAt n (B.unpack x)
, testProperty "splitAt (2^31)" $
Expand Down Expand Up @@ -594,13 +608,13 @@ tests =
#endif

, testProperty "index" $
\(NonNegative n) x -> fromIntegral n < B.length x ==> B.index x (fromIntegral n) === B.unpack x !! n
\(NonNegative n) x -> intToIndexTy n < B.length x ==> B.index x (intToIndexTy n) === B.unpack x !! n
, testProperty "indexMaybe" $
\(NonNegative n) x -> fromIntegral n < B.length x ==> B.indexMaybe x (fromIntegral n) === Just (B.unpack x !! n)
\(NonNegative n) x -> intToIndexTy n < B.length x ==> B.indexMaybe x (intToIndexTy n) === Just (B.unpack x !! n)
, testProperty "indexMaybe Nothing" $
\n x -> (n :: Int) < 0 || fromIntegral n >= B.length x ==> B.indexMaybe x (fromIntegral n) === Nothing
\n x -> n < 0 || intToIndexTy n >= B.length x ==> B.indexMaybe x (intToIndexTy n) === Nothing
, testProperty "!?" $
\n x -> B.indexMaybe x (fromIntegral (n :: Int)) === x B.!? (fromIntegral n)
\(intToIndexTy -> n) x -> B.indexMaybe x n === x B.!? n

#ifdef BYTESTRING_CHAR8
, testProperty "isString" $
Expand Down
Loading

0 comments on commit ed7a9d4

Please sign in to comment.