Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Data.ByteString.Lazy.dropEnd: Use two-pointers technique #629

Merged
merged 8 commits into from
Feb 4, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 78 additions & 49 deletions Data/ByteString/Lazy.hs
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
{-# LANGUAGE BangPatterns #-}
{-# OPTIONS_GHC -fno-warn-incomplete-patterns #-}
{-# OPTIONS_HADDOCK prune #-}
{-# LANGUAGE Trustworthy #-}

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE ScopedTypeVariables #-}

-- |
-- Module : Data.ByteString.Lazy
-- Copyright : (c) Don Stewart 2006
Expand Down Expand Up @@ -237,9 +240,9 @@ import qualified Data.ByteString as P (ByteString) -- type name only
import qualified Data.ByteString as S -- S for strict (hmm...)
import qualified Data.ByteString.Internal.Type as S
import qualified Data.ByteString.Unsafe as S
import qualified Data.ByteString.Lazy.Internal.Deque as D
import Data.ByteString.Lazy.Internal

import Control.Exception (assert)
import Control.Monad (mplus)
import Data.Word (Word8)
import Data.Int (Int64)
Expand Down Expand Up @@ -790,15 +793,73 @@ take i cs0 = take' i cs0
--
-- @since 0.11.2.0
takeEnd :: Int64 -> ByteString -> ByteString
takeEnd i _ | i <= 0 = Empty
takeEnd i cs0 = takeEnd' i cs0
where takeEnd' 0 _ = Empty
takeEnd' n cs =
snd $ foldrChunks takeTuple (n,Empty) cs
takeTuple _ (0, cs) = (0, cs)
takeTuple c (n, cs)
| n > fromIntegral (S.length c) = (n - fromIntegral (S.length c), Chunk c cs)
| otherwise = (0, Chunk (S.takeEnd (fromIntegral n) c) cs)
takeEnd i bs
| i <= 0 = Empty
| otherwise = splitAtEndFold (\_ res -> res) id i bs

-- | Helper function for implementing 'takeEnd' and 'dropEnd'
splitAtEndFold
:: forall result
. (S.StrictByteString -> result -> result)
-- ^ What to do when one chunk of output is ready
-- (The StrictByteString will not be empty.)
-> (ByteString -> result)
-- ^ What to do when the split-point is reached
-> Int64
-- ^ Number of bytes to leave at the end (must be strictly positive)
-> ByteString -- ^ Input ByteString
-> result
{-# INLINE splitAtEndFold #-}
splitAtEndFold step end len bs0 = assert (len > 0) $ case bs0 of
Empty -> end Empty
Chunk c t -> goR len c t t
where
-- Idea: Keep two references into the input ByteString:
-- "bsL" tracks the current split point,
-- "bsR" tracks the yet-unprocessed tail.
-- When they are closer than "len" bytes apart, process more input. ("goR")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it true that bsR is a substring of bsL? Maybe calling them tortoise and hare as in Floyd's algorithm would be helpful.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. bsR is always a suffix of bsL, which is always a suffix of the original input. I'm not sure I like the suggested tortoise/hare naming here but agree bsL/bsR isn't great, either.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

bsL / bsR are quite misleading IMO: if I were to guess I'd say that they are "left bytestring" and "right bytestring" with regards to a certain split point such that bsL <> bsR equals the input string.

Even something very stupid like bs, subBs and subSubBs would be better.

-- When they are at least "len" bytes apart, produce more output. ("goL")
goR :: Int64 -> S.StrictByteString -> ByteString -> ByteString -> result
goR !undershoot nextOutput@(S.BS noFp noLen) bsL bsR =
assert (undershoot > 0) $
-- INVARIANT: length bsL == length bsR + len - undershoot
-- (not 'assert'ed because that would break our laziness properties)
case bsR of
Empty
| undershoot >= intToInt64 noLen
-> end (Chunk nextOutput bsL)
| undershootW <- fromIntegral @Int64 @Int undershoot
-- conversion Int64->Int is OK because 0 < undershoot < noLen
, splitIndex <- noLen - undershootW
, beforeSplit <- S.BS noFp splitIndex
, afterSplit <- S.BS (noFp `S.plusForeignPtr` splitIndex) undershootW
-> step beforeSplit $ end (Chunk afterSplit bsL)

Chunk (S.BS _ cLen) newBsR
| cLen64 <- intToInt64 cLen
, undershoot > cLen64
-> goR (undershoot - cLen64) nextOutput bsL newBsR
| undershootW <- fromIntegral @Int64 @Int undershoot
-> step nextOutput $ goL (cLen - undershootW) bsL newBsR

goL :: Int -> ByteString -> ByteString -> result
goL !overshoot bsL bsR =
assert (overshoot >= 0) $
-- INVARIANT: length bsL == length bsR + len + intToInt64 overshoot
-- (not 'assert'ed because that would break our laziness properties)
case bsL of
Empty -> splitAtEndFoldInvariantFailed
Chunk c@(S.BS _ cLen) newBsL
| overshoot >= cLen
-> step c $ goL (overshoot - cLen) newBsL bsR
| otherwise
-> goR (intToInt64 $ cLen - overshoot) c newBsL bsR

splitAtEndFoldInvariantFailed :: a
-- See Note [Float error calls out of INLINABLE things] in D.B.Internal.Type
splitAtEndFoldInvariantFailed =
moduleError "splitAtEndFold"
"internal error: bsL not longer than bsR"

-- | /O(n\/c)/ 'drop' @n xs@ returns the suffix of @xs@ after the first @n@
-- elements, or 'empty' if @n > 'length' xs@.
Expand All @@ -824,44 +885,9 @@ drop i cs0 = drop' i cs0
--
-- @since 0.11.2.0
dropEnd :: Int64 -> ByteString -> ByteString
dropEnd i p | i <= 0 = p
dropEnd i p = go D.empty p
where go :: D.Deque -> ByteString -> ByteString
go deque (Chunk c cs)
| D.byteLength deque < i = go (D.snoc c deque) cs
| otherwise =
let (output, deque') = getOutput empty (D.snoc c deque)
in foldrChunks Chunk (go deque' cs) output
go deque Empty = fromDeque $ dropEndBytes deque i

len c = fromIntegral (S.length c)

-- get a `ByteString` from all the front chunks of the accumulating deque
-- for which we know they won't be dropped
getOutput :: ByteString -> D.Deque -> (ByteString, D.Deque)
getOutput out deque = case D.popFront deque of
Nothing -> (reverseChunks out, deque)
Just (x, deque') | D.byteLength deque' >= i ->
getOutput (Chunk x out) deque'
_ -> (reverseChunks out, deque)

-- reverse a `ByteString`s chunks, keeping all internal `S.StrictByteString`s
-- unchanged
reverseChunks = foldlChunks (flip Chunk) empty

-- drop n elements from the rear of the accumulating `deque`
dropEndBytes :: D.Deque -> Int64 -> D.Deque
dropEndBytes deque n = case D.popRear deque of
Nothing -> deque
Just (deque', x) | len x <= n -> dropEndBytes deque' (n - len x)
| otherwise ->
D.snoc (S.dropEnd (fromIntegral n) x) deque'

-- build a lazy ByteString from an accumulating `deque`
fromDeque :: D.Deque -> ByteString
fromDeque deque =
List.foldr chunk Empty (D.front deque) `append`
List.foldl' (flip chunk) Empty (D.rear deque)
dropEnd i p
| i <= 0 = p
| otherwise = splitAtEndFold Chunk (const Empty) i p

-- | /O(n\/c)/ 'splitAt' @n xs@ is equivalent to @('take' n xs, 'drop' n xs)@.
splitAt :: Int64 -> ByteString -> (ByteString, ByteString)
Expand Down Expand Up @@ -1688,6 +1714,9 @@ revNonEmptyChunks = List.foldl' (flip Chunk) Empty
revChunks :: [P.ByteString] -> ByteString
revChunks = List.foldl' (flip chunk) Empty

intToInt64 :: Int -> Int64
intToInt64 = fromIntegral @Int @Int64

-- $IOChunk
--
-- ⚠ Using lazy I\/O functions like 'readFile' or 'hGetContents'
Expand Down
65 changes: 0 additions & 65 deletions Data/ByteString/Lazy/Internal/Deque.hs

This file was deleted.

1 change: 0 additions & 1 deletion bytestring.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,6 @@ library
Data.ByteString.Builder.RealFloat.Internal
Data.ByteString.Builder.RealFloat.TableGenerator
Data.ByteString.Internal.Type
Data.ByteString.Lazy.Internal.Deque
Data.ByteString.Lazy.ReadInt
Data.ByteString.Lazy.ReadNat
Data.ByteString.ReadInt
Expand Down
38 changes: 26 additions & 12 deletions tests/Properties/ByteString.hs
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ import Test.Tasty
import Test.Tasty.QuickCheck
import QuickCheckUtils

#ifdef BYTESTRING_LAZY
import Data.Int
#endif

#ifndef BYTESTRING_CHAR8
toElem :: Word8 -> Word8
toElem = id
Expand Down Expand Up @@ -114,16 +118,25 @@ instance Arbitrary Natural where

testRdInt :: forall a. (Arbitrary a, RdInt a) => String -> TestTree
testRdInt s = testGroup s $
[ testProperty "from string" $ \ prefix value suffix ->
[ testProperty "from string" $ int64OK $ \value prefix suffix ->
let si = show @a value
b = prefix <> B.pack si <> suffix
in fmap (second B.unpack) (bread @a b)
=== sread @a (B.unpack prefix ++ si ++ B.unpack suffix)
, testProperty "from number" $ \n ->
, testProperty "from number" $ int64OK $ \n ->
bread @a (B.pack (show n)) === Just (n, B.empty)
]
#endif

intToIndexTy :: Int -> IndexTy
#ifdef BYTESTRING_LAZY
type IndexTy = Int64
intToIndexTy = fromIntegral @Int @Int64
#else
type IndexTy = Int
intToIndexTy = id
#endif

tests :: [TestTree]
tests =
[ testProperty "pack . unpack" $
Expand Down Expand Up @@ -308,7 +321,7 @@ tests =
#endif

, testProperty "drop" $
\n x -> B.unpack (B.drop n x) === List.genericDrop n (B.unpack x)
\(intToIndexTy -> n) x -> B.unpack (B.drop n x) === List.genericDrop n (B.unpack x)
, testProperty "drop 10" $
\x -> let n = 10 in B.unpack (B.drop n x) === List.genericDrop n (B.unpack x)
, testProperty "drop 2^31" $
Expand All @@ -325,7 +338,7 @@ tests =
#endif

, testProperty "take" $
\n x -> B.unpack (B.take n x) === List.genericTake n (B.unpack x)
\(intToIndexTy -> n) x -> B.unpack (B.take n x) === List.genericTake n (B.unpack x)
, testProperty "take 10" $
\x -> let n = 10 in B.unpack (B.take n x) === List.genericTake n (B.unpack x)
, testProperty "take 2^31" $
Expand All @@ -342,11 +355,11 @@ tests =
#endif

, testProperty "dropEnd" $
\n x -> B.dropEnd n x === B.take (B.length x - n) x
\(intToIndexTy -> n) x -> B.dropEnd n x === B.take (B.length x - n) x
, testProperty "dropWhileEnd" $
\f x -> B.dropWhileEnd f x === B.reverse (B.dropWhile f (B.reverse x))
, testProperty "takeEnd" $
\n x -> B.takeEnd n x === B.drop (B.length x - n) x
\(intToIndexTy -> n) x -> B.takeEnd n x === B.drop (B.length x - n) x
, testProperty "takeWhileEnd" $
\f x -> B.takeWhileEnd f x === B.reverse (B.takeWhile f (B.reverse x))

Expand All @@ -366,7 +379,7 @@ tests =
, testProperty "compareLength 4" $
\x (toElem -> c) -> B.compareLength (B.snoc x c <> undefined) (B.length x) === GT
, testProperty "compareLength 5" $
\x n -> B.compareLength x n === compare (B.length x) n
\x (intToIndexTy -> n) -> B.compareLength x n === compare (B.length x) n
, testProperty "dropEnd lazy" $
\(toElem -> c) -> B.take 1 (B.dropEnd 1 (B.singleton c <> B.singleton c <> B.singleton c <> undefined)) === B.singleton c
, testProperty "dropWhileEnd lazy" $
Expand Down Expand Up @@ -470,7 +483,8 @@ tests =
(l1 == l2 || l1 == l2 + 1) && sum (map B.length splits) + l2 == B.length x

, testProperty "splitAt" $
\n x -> (B.unpack *** B.unpack) (B.splitAt n x) === List.genericSplitAt n (B.unpack x)
\(intToIndexTy -> n) x -> (B.unpack *** B.unpack) (B.splitAt n x)
=== List.genericSplitAt n (B.unpack x)
, testProperty "splitAt 10" $
\x -> let n = 10 in (B.unpack *** B.unpack) (B.splitAt n x) === List.genericSplitAt n (B.unpack x)
, testProperty "splitAt (2^31)" $
Expand Down Expand Up @@ -594,13 +608,13 @@ tests =
#endif

, testProperty "index" $
\(NonNegative n) x -> fromIntegral n < B.length x ==> B.index x (fromIntegral n) === B.unpack x !! n
\(NonNegative n) x -> intToIndexTy n < B.length x ==> B.index x (intToIndexTy n) === B.unpack x !! n
, testProperty "indexMaybe" $
\(NonNegative n) x -> fromIntegral n < B.length x ==> B.indexMaybe x (fromIntegral n) === Just (B.unpack x !! n)
\(NonNegative n) x -> intToIndexTy n < B.length x ==> B.indexMaybe x (intToIndexTy n) === Just (B.unpack x !! n)
, testProperty "indexMaybe Nothing" $
\n x -> (n :: Int) < 0 || fromIntegral n >= B.length x ==> B.indexMaybe x (fromIntegral n) === Nothing
\n x -> n < 0 || intToIndexTy n >= B.length x ==> B.indexMaybe x (intToIndexTy n) === Nothing
, testProperty "!?" $
\n x -> B.indexMaybe x (fromIntegral (n :: Int)) === x B.!? (fromIntegral n)
\(intToIndexTy -> n) x -> B.indexMaybe x n === x B.!? n

#ifdef BYTESTRING_CHAR8
, testProperty "isString" $
Expand Down
19 changes: 19 additions & 0 deletions tests/QuickCheckUtils.hs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ module QuickCheckUtils
, String8(..)
, CByteString(..)
, Sqrt(..)
, int64OK
) where

import Test.Tasty.QuickCheck
Expand Down Expand Up @@ -112,3 +113,21 @@ instance Arbitrary SB.ShortByteString where

instance CoArbitrary SB.ShortByteString where
coarbitrary s = coarbitrary (SB.unpack s)

-- | This /poison instance/ exists to make accidental mis-use
-- of the @Arbitrary Int64@ instance a bit less likely.
instance {-# OVERLAPPING #-} Testable (Int64 -> prop) where
clyring marked this conversation as resolved.
Show resolved Hide resolved
property = error $ unlines [
"Found a test taking a raw Int64 argument.",
"'instance Arbitrary Int64' by default is likely to",
"produce very large numbers after the first few tests,",
"which doesn't make great indices into a LazyByteString.",
"For indices, try 'intToIndexTy' in Properties/ByteString.hs.",
"",
"If very few small-numbers tests is OK, use",
"'int64OK' to bypass this poison-instance."]

-- | Use this to bypass the poison instance for @Testable (Int64 -> prop)@
-- defined in "QuickCheckUtils".
int64OK :: (Arbitrary a, Show a, Testable b) => (a -> b) -> Property
int64OK f = propertyForAllShrinkShow arbitrary shrink (\v -> [show v]) f
Loading
Loading