How to compress output when writing to a file?

I have a computation that, among other things, generates some data (a lot of it) that I want to write to a file.

The way I have structured the code so far (simplified):

-- | Serialise one record and write it, followed by a newline, to the handle.
writeRecord :: Handle -> Record -> IO ()
writeRecord h = hPutStrLn h . toByteString

This function is then called periodically during a larger calculation. It is almost like a log, and in fact, several files are written simultaneously.

Now I want the output file to be compressed with gzip. In languages like Java, I would do something like:

// java.util.zip.GZIPOutputStream wraps the file stream, so everything written to outStream is gzip-compressed
outStream = new GZIPOutputStream(new FileOutputStream(path))

and then just write to this wrapped output stream.

What is the way to do this in Haskell? I think something like

-- Each record is passed through `compressed` on its own before being written.
writeRecord h r = hPut h ((compressed . toByteString) r)

is not correct, because compressing each small piece individually is inefficient (I even tried it, and the compressed file came out larger than the uncompressed one).

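I also don't think I can simply produce a lazy ByteString (or even a list of chunks) and then write it out with compressed . fromChunks, because that would require my "generator" to build the whole thing in memory. And the fact that several files are produced at the same time makes it even more complicated.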

So, what would be the way to solve this in Haskell? Writing to the file(s) and having them gzipped?

+4
5

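Doing this with conduit is fairly straightforward, though you will need to adjust your code a bit. The basic idea is: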

  • Replace hPutStr h with yield
  • Add some liftIO wrappers
  • Instead of using withBinaryFile or the like, use runConduitRes, gzip and sinkFile

Here is an example:

#!/usr/bin/env stack
-- stack --resolver lts-6.21 --install-ghc runghc --package conduit-extra
{-# LANGUAGE OverloadedStrings #-}
import Control.Monad.IO.Class (MonadIO, liftIO)
import Data.ByteString (ByteString, hPutStr)
import Data.Conduit (ConduitM, (.|), yield, runConduitRes)
import Data.Conduit.Binary (sinkFile)
import Data.Conduit.Zlib (gzip)
import System.IO (Handle)

-- Stand-in for whatever IO action produces the next piece of output
someAction :: IO ByteString
someAction = pure "This is a string\n"

-- Original version: the produced bytes are written straight to the handle
producerHandle :: Handle -> IO ()
producerHandle h = someAction >>= hPutStr h

-- Conduit version: the produced bytes are yielded downstream instead of written
producerConduit :: MonadIO m => ConduitM i ByteString m ()
producerConduit = liftIO someAction >>= yield

-- Feed the producer through gzip and into the output file
main :: IO ()
main = runConduitRes (producerConduit .| gzip .| sinkFile "some-file.txt.gz")

.

Java , , , .

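Here is a version that is closer to your Java-style approach. It relies on a SinkFunc.hs module, which is available as a Gist at: https://gist.github.com/snoyberg/283154123d30ff9e201ea4436a5dd22d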

#!/usr/bin/env stack
-- stack --resolver lts-6.21 --install-ghc runghc --package conduit-extra
{-# LANGUAGE OverloadedStrings #-}
{-# OPTIONS_GHC -Wall -Werror #-}
import Data.ByteString (ByteString)
import Data.Conduit ((.|))
import Data.Conduit.Binary (sinkHandle)
import Data.Conduit.Zlib (gzip)
import System.IO (withBinaryFile, IOMode (WriteMode))
import SinkFunc (withSinkFunc)

-- Stand-in for whatever IO action produces the next piece of output
someAction :: IO ByteString
someAction = pure "This is a string\n"

-- Callback-style producer: hands its output to the supplied write action
producerFunc :: (ByteString -> IO ()) -> IO ()
producerFunc write = someAction >>= write

-- Open the target file in binary mode and give the producer a write
-- function backed by a gzip-then-write-to-handle sink.
main :: IO ()
main = withBinaryFile "some-file.txt.gz" WriteMode $ \h ->
    withSinkFunc (gzip .| sinkHandle h) producerFunc

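EDIT 2: For good measure, here is another approach that uses ZipSink to stream the data exactly once to several different files. As the example shows: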

#!/usr/bin/env stack
-- stack --resolver lts-6.21 --install-ghc runghc --package conduit-extra
{-# LANGUAGE OverloadedStrings #-}
import Control.Monad.Trans.Resource (MonadResource)
import Data.ByteString (ByteString)
import Data.Conduit (ConduitM, (.|), yield, runConduitRes, ZipSink (..))
import Data.Conduit.Binary (sinkFile)
import qualified Data.Conduit.List as CL
import Data.Conduit.Zlib (gzip)

-- | A chunk of output tagged with its destination, either 'Foo' or 'Bar'.
data Output = Foo ByteString | Bar ByteString

-- | Payload of a 'Foo' value; 'Nothing' for any other constructor.
fromFoo :: Output -> Maybe ByteString
fromFoo output = case output of
    Foo payload -> Just payload
    _           -> Nothing

-- | Payload of a 'Bar' value; 'Nothing' for any other constructor.
fromBar :: Output -> Maybe ByteString
fromBar output = case output of
    Bar payload -> Just payload
    _           -> Nothing

-- | Yields one 'Foo' chunk followed by one 'Bar' chunk.
producer :: Monad m => ConduitM i Output m ()
producer = mapM_ yield
    [ Foo "This is going to Foo"
    , Bar "This is going to Bar"
    ]

-- | Sink that keeps only the payloads picked out by the projection,
-- gzips them, and writes the result to the given file.
sinkHelper :: MonadResource m
           => FilePath
           -> (Output -> Maybe ByteString)
           -> ConduitM Output o m ()
sinkHelper path select = CL.mapMaybe select .| gzip .| sinkFile path

-- Stream the producer once into both sinks, combined through ZipSink.
main :: IO ()
main = runConduitRes (producer .| getZipSink (fooSink *> barSink))
  where
    fooSink = ZipSink (sinkHelper "foo.txt.gz" fromFoo)
    barSink = ZipSink (sinkHelper "bar.txt.gz" fromBar)
+3

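All of the streaming libraries support compression. If I understand the particular problem and the way you are thinking about it, io-streams may be the simplest for your purposes. Here I alternate between writing to the trump and clinton output streams, which are written as compressed files. After that, I show the pipes equivalent of Michael's conduit program.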

#!/usr/bin/env stack
-- stack --resolver lts-6.21 --install-ghc runghc --package io-streams
{-# LANGUAGE OverloadedStrings #-}

import qualified System.IO.Streams as IOS
import qualified System.IO as IO
import Data.ByteString (ByteString)

-- | Alternates writes between the two output streams, then sends 'Nothing'
-- to each of them as the final write.
analyzer :: IOS.OutputStream ByteString -> IOS.OutputStream ByteString -> IO ()
analyzer clinton trump = mapM_ send
    [ (clinton, Just "This is a string\n")
    , (trump,   Just "This is a string\n")
    , (clinton, Just "Clinton string\n")
    , (trump,   Just "Trump string\n")
    , (clinton, Just "Another Clinton string\n")
    , (trump,   Just "Another Trump string\n")
    , (clinton, Nothing)
    , (trump,   Nothing)
    ]
  where
    send (out, msg) = IOS.write msg out

-- Open both files, wrap each file stream in a gzip stream, and run the analyzer.
main :: IO ()
main =
  IOS.withFileAsOutput "some-file-clinton.txt.gz" $ \clintonFile ->
    IOS.withFileAsOutput "some-file-trump.txt.gz" $ \trumpFile -> do
      clinton <- gzipped clintonFile
      trump <- gzipped trumpFile
      analyzer clinton trump
  where
    gzipped = IOS.gzip IOS.defaultCompressionLevel

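Obviously, you can mix all kinds of IO into analyzer between the writes to the two output streams; here I am just writing fixed strings. In particular, if analyzer also depended on input streams, each write could depend on a read from them. (Slightly!) more complicated programs work the same way. If I run the program above, I see: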

$ stack gzip_so.hs  
$ gunzip some-file-clinton.txt.gz 
$ gunzip some-file-trump.txt.gz 
$ cat some-file-clinton.txt 
This is a string
Clinton string
Another Clinton string
$ cat some-file-trump.txt 
This is a string
Trump string
Another Trump string

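The same can be done with pipes. Here, for comparison, is the pipes equivalent of the conduit program by Michael S: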

#!/usr/bin/env stack
-- stack --resolver lts-6.21 --install-ghc runghc  --package pipes-zlib 
{-# LANGUAGE OverloadedStrings #-}
import Control.Monad.IO.Class (MonadIO, liftIO)
import Data.ByteString (ByteString, hPutStr)
import System.IO  (IOMode(..), withFile, Handle)
import Pipes  
import qualified Pipes.ByteString as PB
import qualified Pipes.GZip as P

-- Stand-in for whatever IO action produces the next piece of output
someAction :: IO ByteString
someAction = pure "This is a string\n"

-- Original version: the produced bytes are written straight to the handle
producerHandle :: Handle -> IO ()
producerHandle h = someAction >>= hPutStr h

-- Pipes version: the produced bytes are yielded downstream instead of written
producerPipe :: MonadIO m => Producer ByteString m ()
producerPipe = liftIO someAction >>= yield

-- Compress the producer's output and drain it into the opened file handle.
main :: IO ()
main = withFile "some-file-pipes.txt.gz" WriteMode $ \h ->
    runEffect (P.compress P.defaultCompression producerPipe >-> PB.toHandle h)

-

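For what it's worth, here is yet another way to write to several files at once, this time by nesting producer layers; it differs from the approaches of Michael S and danidiaz: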

#!/usr/bin/env stack
-- stack --resolver lts-6.21 --install-ghc runghc --package pipes-zlib
{-# LANGUAGE OverloadedStrings #-}
import Pipes
import Pipes.GZip
import qualified Pipes.Prelude as P
import qualified Pipes.ByteString as Bytes
import System.IO
import Control.Monad (replicateM_)

-- Repeats 50000 times: emit one line for each of three recipients.
-- The recipients differ only in how many times `yield` is lifted
-- (none, once, twice), i.e. each one targets a different layer of a
-- stack of nested producers.
producer = replicateM_ 50000 $ do
    marie  "This is going to Marie\n"  -- arbitrary IO can be interspersed here
    arthur "This is going to Arthur\n" -- with liftIO
    sylvia "This is going to Sylvia\n" 
  where 
    marie = yield; arthur = lift . yield; sylvia = lift . lift . yield

-- Compress the given producer with the `bestSpeed` setting and drain the
-- compressed bytes into the handle.
sinkHelper handle layer = runEffect $ compress bestSpeed layer >-> Bytes.toHandle handle

-- Open the three output files, then run one sinkHelper per file,
-- applying the one for marie first and the one for sylvia last.
main :: IO ()
main =
   withFile "marie.txt.gz" WriteMode $ \marie ->
   withFile "arthur.txt.gz" WriteMode $ \arthur ->
   withFile "sylvia.txt.gz" WriteMode $ \sylvia ->
      (sinkHelper sylvia . sinkHelper arthur . sinkHelper marie) producer

, , - " ". - streaming.

+5

For incremental compression, I think you could use the compressIO/foldCompressStream functions from Codec.Compression.Zlib.Internal.

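If you can represent your producer action as an IO (Maybe a) (for example, an MVar take or an InputStream/Chan read), where Nothing signals the end of input, something like this should work: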

import System.IO (Handle)
import qualified Data.ByteString as BS
import qualified Codec.Compression.Zlib.Internal as ZLib

-- | Incrementally compress everything pulled from @source@ and write the
-- compressed chunks to @handle@.
--
-- @source@ is polled whenever the compressor needs more input: @Just chunk@
-- supplies more data and @Nothing@ marks the end of the input.
--
-- The compressor itself treats an /empty/ chunk as end of input, so empty
-- chunks coming from @source@ are skipped; otherwise a stray @Just ""@ would
-- finalise the stream early and drop everything after it.
--
-- NOTE(review): 'ZLib.rawFormat' is a bare deflate stream, not the gzip
-- container; use 'ZLib.gzipFormat' if the output must be readable by
-- @gunzip@ -- confirm what the consumer of the file expects.
compressedWriter :: Handle -> (IO (Maybe BS.ByteString)) -> IO ()
compressedWriter handle source =
  ZLib.foldCompressStream
    feed
    (\chunk next -> BS.hPut handle chunk >> next)
    (return ())
    (ZLib.compressIO ZLib.rawFormat ZLib.defaultCompressParams)
  where
    -- Pull from the source until there is real data or the source is done.
    feed next = do
      pulled <- source
      case pulled of
        Nothing -> next BS.empty
        Just chunk
          | BS.null chunk -> feed next
          | otherwise     -> next chunk
+2

This solution is similar to Michael Snoyman's EDIT 2, but uses the foldl, pipes, pipes-zlib and streaming-eversion packages.

 {-# language OverloadedStrings #-}
module Main where

-- cabal install bytestring foldl pipes pipes-zlib streaming-eversion
import Data.Foldable
import Data.ByteString
import qualified Control.Foldl as L 
import Pipes 
import qualified Pipes.Prelude
import Pipes.Zlib (compress,defaultCompression,defaultWindowBits)
import Streaming.Eversion.Pipes (transvertMIO)
import System.IO

-- | Label attached to each chunk; it decides which output receives the chunk.
type Tag = String

-- | Yields one chunk tagged "foo" followed by one chunk tagged "bar".
producer :: Monad m => Producer (Tag,ByteString) m ()
producer = mapM_ yield
    [ ("foo", "This is going to Foo")
    , ("bar", "This is going to Bar")
    ]

-- | Effectful fold that writes the chunks carrying the given tag to the handle.
--
-- Reading the composition in data-flow order (top to bottom):
--
--   1. each @(tag', bytes)@ pair becomes @Just bytes@ when the tags match
--      and @Nothing@ otherwise;
--   2. @L.handlesM L.folded@ feeds only the contents of the @Just@ values on;
--   3. the pipes @compress@ transformation is applied to the fold's input via
--      @transvertMIO@;
--   4. every resulting chunk is written to @handle@.
--
-- NOTE(review): this passes a pure function to @L.premapM@; some foldl
-- releases expect a monadic function there -- confirm against the version
-- in use.
foldForTag :: Handle -> Tag -> L.FoldM IO (Tag,ByteString) ()
foldForTag handle tag = 
      L.premapM (\(tag',bytes) -> if tag' == tag then Just bytes else Nothing)
    . L.handlesM L.folded
    . transvertMIO (compress defaultCompression defaultWindowBits)
    $ L.mapM_ (Data.ByteString.hPut handle)

-- Open both output files and run a single pass over the producer with the
-- per-tag folds combined into one.
main :: IO ()
main =
    withFile "foo.txt" WriteMode $ \fooHandle ->
    withFile "bar.txt" WriteMode $ \barHandle ->
        L.impurely Pipes.Prelude.foldM (multifold fooHandle barHandle) producer
  where
    multifold h1 h2 = traverse_ (uncurry foldForTag) [(h1,"foo"),(h2,"bar")]
+1

This solution is similar to Michael Snoyman's EDIT 2, but uses the streaming, streaming-bytestring, pipes and pipes-zlib packages.

{-# language OverloadedStrings #-}
module Main where

-- cabal install bytestring streaming streaming-bytestring pipes pipes-zlib 
import Data.ByteString
import qualified Data.ByteString.Streaming as B
import Streaming
import qualified Streaming.Prelude as S
import Pipes (next)
import qualified Pipes.Prelude 
import Pipes.Zlib (compress,defaultCompression,defaultWindowBits)
import System.IO

-- | Label attached to each chunk; it decides which output receives the chunk.
type Tag = String

-- | Yields one chunk tagged "foo" followed by one chunk tagged "bar".
producer :: Monad m => Stream (Of (Tag,ByteString)) m ()
producer = mapM_ S.yield
    [ ("foo", "This is going to Foo")
    , ("bar", "This is going to Bar")
    ]

-- I couldn't find a streaming-zlib on Hackage, took a pipes detour
--
-- Compresses a stream of chunks in three steps (read bottom to top):
-- the stream is rebuilt as a pipes producer by unfolding with S.next,
-- the pipes `compress` is applied to it, and the result is rebuilt as a
-- stream by unfolding with Pipes.next.
compress' :: MonadIO m 
          => Stream (Of ByteString) m r -> Stream (Of ByteString) m r 
compress' = S.unfoldr Pipes.next
          . compress defaultCompression defaultWindowBits
          . Pipes.Prelude.unfoldr S.next     

-- | Keep only the elements carrying the given tag and drop the tag itself.
keepTag :: Monad m 
        => Tag -> Stream (Of (Tag,ByteString)) m r -> Stream (Of ByteString) m r 
keepTag tag = S.map snd . S.filter matches
  where
    matches (t, _) = tag == t

-- The producer is duplicated with S.copy and then consumed by two
-- filter / compress / write pipelines, one per tag. The composition is
-- applied right to left, so the "bar" pipeline is applied to the copied
-- stream first and the "foo" pipeline to what it returns.
-- NOTE(review): both files receive compressed bytes despite the plain
-- ".txt" names.
main :: IO ()
main = runResourceT 
     . B.writeFile "foo.txt" . B.fromChunks . compress' .  keepTag "foo"  
     . B.writeFile "bar.txt"  . B.fromChunks . compress' . keepTag "bar"  
     $ S.copy producer

I use the copy function from Streaming.Prelude, which, according to its documentation, lets you:

Duplicate the content of a stream, so that it can be acted on twice in different ways, but without breaking streaming.

+1
source

Source: https://habr.com/ru/post/1658314/


All Articles