From 329ddd7c3cd5461e7465e879b8df7b188b5fc5d1 Mon Sep 17 00:00:00 2001 From: Navin Keswani Date: Thu, 30 Mar 2017 21:33:44 +1100 Subject: [PATCH] Implements split and merge commands. --- README.md | 43 ++++++- ambiata-regiment.cabal | 1 + main/regiment.hs | 60 +++++++-- src/Regiment/Data.hs | 2 +- src/Regiment/IO.hs | 84 +++++++++++-- src/Regiment/Parse.hs | 18 +-- test/Test/IO/Regiment/IO.hs | 186 ++++++++++++++++------------ test/Test/Regiment/Arbitrary.hs | 73 ++++++----- test/Test/Regiment/Vanguard/List.hs | 5 + test/cli/basic-usage/run | 35 ++++++ 10 files changed, 363 insertions(+), 144 deletions(-) diff --git a/README.md b/README.md index 3dd0ac8..220ea18 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,8 @@ A command line tool for sorting standardized separated files ## Usage -------- +`sort` a standardized separated file. + ``` # specify: # column to sort on (mandatory) @@ -48,4 +50,43 @@ regiment sort -k 5 -c 15 -f ',' -o "path/to/output-file" input-file regiment sort -f ',' -k 1 -k 4 -k 5 -c 26 -m 10G --crlf --standardized -o "path/to/output-file" input-file ``` -Note: `regiment` requires local storage roughly equivalent to the size of the inputs, and follows unix `TMPDIR` conventions for that storage. +`split` a standardized separated file into a set of temporary files, each of which is sorted, +and is in regiment's [binary format](doc/temp-file-format.md) + +``` +# specify: +# same options as for sort (except for --output) +# a directory within which to write the sorted splits (mandatory) +# NOTE: this directory must not exist, it will be created for you +regiment split --dir "path/to/output-dir" input-file +regiment split -d "path/to/output-dir" input-file +``` + +Given the format of an input standardized separated file, merge a set of sorted temporary files +(in regiment's [binary format](doc/temp-file-format.md)) into an output-file (that has the same format +as the input standardized separated file). + +``` +# specify: +# directories containing sorted splits that require merging (typically outputs of running split) +# output file (optional) -- defaults to stdout +regiment merge-tmps dir1 dir2 ... dirn + +# explicity specify path to output file -- defaults to stdout +regiment merge-tmps --output "path/to/output-file" dir1 dir2 ... dirn +regiment merge-tmps -o "path/to/output-file" dir1 dir2 ... dirn +``` + +Relationship between `sort`, `split` and `merge-tmps` + +``` +regiment sort -k 1 -c 5 -f ',' --standardized input-file + +generates the same output as + +regiment split -k 1 -c 5 -f ',' --standardized -d "/foo/bar/baz" input-file +regiment merge-tmps "/foo/bar/baz" +``` + +Note: `regiment` requires local storage roughly equivalent to the size of the inputs, +and follows unix `TMPDIR` conventions for that storage. diff --git a/ambiata-regiment.cabal b/ambiata-regiment.cabal index 754a2ec..4ae69c3 100644 --- a/ambiata-regiment.cabal +++ b/ambiata-regiment.cabal @@ -21,6 +21,7 @@ library , binary >= 0.7 && < 0.9 , bytestring == 0.10.* , directory == 1.2.* + , exceptions >= 0.6 && < 0.9 , filepath == 1.3.* , heaps == 0.3.* , primitive == 0.6.* diff --git a/main/regiment.hs b/main/regiment.hs index d40af8d..88b60a8 100644 --- a/main/regiment.hs +++ b/main/regiment.hs @@ -6,7 +6,7 @@ import DependencyInfo_ambiata_regiment import qualified Data.Attoparsec.Text as A import Data.Char (ord) -import Data.Text as T +import qualified Data.Text as T import Data.Word (Word8) import Options.Applicative @@ -33,22 +33,44 @@ main = do SortCommand inn out nc sc sep m f n -> orDie renderRegimentIOError $ regiment inn out sc f n nc sep m + SplitCommand inn nc sc sep m f n tmp -> + orDie renderRegimentIOError $ do + createDirectory tmp + firstT RegimentIOParseError $ + split inn tmp sc f n nc sep m + MergeCommand out dirs -> + orDie renderRegimentIOError $ + mergeDirs dirs out parser :: Parser Command parser = subparser $ - command' "sort" "Sort input file based on sort column(s)." - (SortCommand <$> inputFileP - <*> optional outputP - <*> numColumnsP - <*> some sortColumnP - <*> separatorP - <*> memP - <*> formatP - <*> newlineP) + command' "sort" "Sort input file based on sort column(s)." + (SortCommand <$> inputFileP + <*> optional outputP + <*> numColumnsP + <*> some sortColumnP + <*> separatorP + <*> memP + <*> formatP + <*> newlineP) + <> command' "split" "Split input file into sorted chunks (intermediate state of sort)." + (SplitCommand <$> inputFileP + <*> numColumnsP + <*> some sortColumnP + <*> separatorP + <*> memP + <*> formatP + <*> newlineP + <*> tempDirectoryArgP) + <> command' "merge-tmps" "Merge sorted temp files (output of split)." + (MergeCommand <$> optional outputP + <*> some tempDirectoryP) data Command = - SortCommand InputFile (Maybe OutputFile) NumColumns [SortColumn] Separator MemoryLimit FormatKind Newline + SortCommand InputFile (Maybe OutputFile) NumColumns [SortColumn] Separator MemoryLimit FormatKind Newline + | SplitCommand InputFile NumColumns [SortColumn] Separator MemoryLimit FormatKind Newline TempDirectory + | MergeCommand (Maybe OutputFile) [TempDirectory] deriving (Eq, Show) inputFileP :: Parser InputFile @@ -65,6 +87,22 @@ outputP = OutputFile <$> (strOption $ <> metavar "FILE" <> help "Optional path to output file -- defaults to stdout.") +tempDirectoryP :: Parser TempDirectory +tempDirectoryP = + fmap TempDirectory . strArgument . mconcat $ [ + metavar "TMP_DIRECTORY" + , help "Path to directory containing intermediate sorted files." + ] + +tempDirectoryArgP :: Parser TempDirectory +tempDirectoryArgP = + fmap TempDirectory . strOption . mconcat $ [ + long "dir" + , short 'd' + , metavar "TMP_DIRECTORY" + , help "Path to directory to write out intermediate sorted files." + ] + sortColumnP :: Parser SortColumn sortColumnP = fmap (SortColumn . (\k -> k - 1)) . option auto . mconcat $ [ diff --git a/src/Regiment/Data.hs b/src/Regiment/Data.hs index 3d10a32..2240d58 100644 --- a/src/Regiment/Data.hs +++ b/src/Regiment/Data.hs @@ -112,7 +112,7 @@ countKeyedPayload kp = -- └─────────┴─────────┴─────────┘ data Cursor a = - NonEmpty a KeyedPayload + NonEmpty !a !KeyedPayload | EOF deriving (Show) diff --git a/src/Regiment/IO.hs b/src/Regiment/IO.hs index 7c3d9c0..c7c4939 100644 --- a/src/Regiment/IO.hs +++ b/src/Regiment/IO.hs @@ -2,15 +2,22 @@ {-# LANGUAGE OverloadedStrings #-} module Regiment.IO ( RegimentIOError (..) + , mergeDirs , regiment , renderRegimentIOError + , split , open + , createDirectory ) where -import Control.Monad.IO.Class (liftIO) +import Control.Exception (SomeException) +import Control.Monad.Catch (MonadCatch) +import Control.Monad.IO.Class (liftIO, MonadIO) import Control.Monad.Trans.Resource (MonadResource (..)) import qualified Control.Monad.Trans.Resource as R +import qualified Data.Text as T + import P import Regiment.Data @@ -18,16 +25,19 @@ import Regiment.Parse import Regiment.Vanguard.Base import Regiment.Vanguard.IO -import System.Directory (getDirectoryContents) +import qualified System.Directory as SD import System.FilePath (()) -import System.IO (IO, stdout, IOMode (..), FilePath, Handle, hClose, openBinaryFile) +import System.IO (IO, stdout, IOMode (..), FilePath, Handle, hClose, openBinaryFile, hFlush) import System.IO.Temp (withSystemTempDirectory) -import X.Control.Monad.Trans.Either (EitherT, newEitherT, mapEitherT, runEitherT, firstEitherT) +import X.Control.Monad.Trans.Either (EitherT, newEitherT, mapEitherT, left) +import X.Control.Monad.Trans.Either (runEitherT, firstEitherT, tryEitherT) data RegimentIOError = RegimentIOParseError RegimentParseError | RegimentIOMergeError (RegimentMergeError RegimentMergeIOError) + | RegimentIOCreateDirectoryError TempDirectory Text + | RegimentIOTmpDirExistsError TempDirectory deriving (Eq, Show) renderRegimentIOError :: RegimentIOError -> Text @@ -37,6 +47,15 @@ renderRegimentIOError err = renderRegimentParseError e RegimentIOMergeError e -> renderRegimentMergeError renderRegimentMergeIOError e + RegimentIOCreateDirectoryError (TempDirectory tmp) t -> + "Failed to create directory " + <> T.pack tmp + <> "Error: " <> t + RegimentIOTmpDirExistsError (TempDirectory tmp) -> + "Failed to create directory " + <> T.pack tmp + <> "\n" + <> "Target directory should not exist, it will be created and filled." regiment :: InputFile @@ -49,6 +68,24 @@ regiment :: -> MemoryLimit -> EitherT RegimentIOError IO () regiment inn out sc f n nc sep m = do + firstEitherT RegimentIOParseError . newEitherT $ + withSystemTempDirectory "regiment." $ \tmp -> + R.runResourceT . runEitherT $ do + mapEitherT liftIO $ split inn (TempDirectory tmp) sc f n nc sep m + handles <- openDir (TempDirectory tmp) + mapEitherT liftIO . firstT RegimentParseMergeError $ merge handles out + +split :: + InputFile + -> TempDirectory + -> [SortColumn] + -> FormatKind + -> Newline + -> NumColumns + -> Separator + -> MemoryLimit + -> EitherT RegimentParseError IO () +split inn tmp sc f n nc sep m = do let fmt = Format { @@ -57,26 +94,47 @@ regiment inn out sc f n nc sep m = do , formatColumnCount = numColumns nc , formatKind = f } - firstEitherT RegimentIOParseError . newEitherT $ - withSystemTempDirectory "regiment." $ \tmp -> - runEitherT $ do - toTempFiles inn (TempDirectory tmp) fmt sc m - firstT RegimentParseMergeError $ merge (TempDirectory tmp) out + toTempFiles inn tmp fmt sc m + +mergeDirs :: + [TempDirectory] + -> Maybe OutputFile + -> EitherT RegimentIOError IO () +mergeDirs dirs out = firstT RegimentIOMergeError $ + mapEitherT R.runResourceT $ do + handles <- mapM openDir dirs + mapEitherT liftIO $ merge (concat handles) out merge :: - TempDirectory + [Handle] -> Maybe OutputFile -> EitherT (RegimentMergeError RegimentMergeIOError) IO () -merge (TempDirectory tmp) out = mapEitherT R.runResourceT $ do - fs <- liftIO $ fmap (filter (flip notElem [".", ".."])) $ getDirectoryContents tmp - handles <- mapM (open ReadMode) $ fmap (tmp ) fs +merge handles out = mapEitherT R.runResourceT $ do v <- mapEitherT liftIO $ formVanguardIO handles out' <- case out of Just (OutputFile o) -> open WriteMode o Nothing -> return stdout mapEitherT liftIO $ runVanguardIO v out' + liftIO $ hFlush out' + +openDir :: MonadResource m => TempDirectory -> m [Handle] +openDir (TempDirectory tmp) = do + fs <- liftIO $ fmap (filter (flip notElem [".", ".."])) $ SD.getDirectoryContents tmp + handles <- mapM (open ReadMode) $ fmap (tmp ) fs + return handles open :: MonadResource m => IOMode -> FilePath -> m Handle open m f = do snd <$> R.allocate (openBinaryFile f m) hClose +createDirectory :: (MonadIO m, MonadCatch m) => TempDirectory -> EitherT RegimentIOError m () +createDirectory t@(TempDirectory tmp) = do + exists <- liftIO $ SD.doesDirectoryExist tmp + if exists + then + left $ RegimentIOTmpDirExistsError t + else + tryEitherT handler . liftIO $ SD.createDirectoryIfMissing True tmp + where + handler :: SomeException -> RegimentIOError + handler e = RegimentIOCreateDirectoryError t $ T.pack (show e) diff --git a/src/Regiment/Parse.hs b/src/Regiment/Parse.hs index 2d2f991..3439926 100644 --- a/src/Regiment/Parse.hs +++ b/src/Regiment/Parse.hs @@ -25,6 +25,7 @@ import qualified Data.Vector.Algorithms.Tim as Tim import P import qualified Parsley.Xsv.Parser as Parsley +import qualified Parsley.Xsv.Render as Parsley import Regiment.Data import Regiment.Serial @@ -93,8 +94,7 @@ toTempFiles (InputFile inn) tmpDir f sc (MemoryLimit cap) = do >> go counter (drops + 1) partNum memCounter rest) (Parsley.Success $ \rest fields -> let - bytesParsed = BS.take (BS.length bytes - BS.length rest) bytes - sko = selectSortKeys bytesParsed (Parsley.getFields fields) sc + sko = selectSortKeys fields f sc in case sko of Left _ -> @@ -116,20 +116,22 @@ toTempFiles (InputFile inn) tmpDir f sc (MemoryLimit cap) = do liftIO (BS.hGetSome h innChunkSize) >>= go (0 :: Int) (0 :: Int) (0 :: Int) (0 :: Int) selectSortKeys :: - BS.ByteString - -> (Boxed.Vector BS.ByteString) + Parsley.Fields + -> Format -> [SortColumn] -> Either RegimentParseError (Boxed.Vector BS.ByteString) -selectSortKeys bytes parsed sortColumns = +selectSortKeys fields fmt sortColumns = let + parsed = Parsley.getFields fields + unparsed = Parsley.renderRow fmt parsed <> (Parsley.renderNewline $ formatNewline fmt) maybeSortkeys = L.map (\sc -> parsed Boxed.!? (sortColumn sc)) sortColumns ks = DM.catMaybes maybeSortkeys keyNotFound = and $ L.map isNothing maybeSortkeys - in do + in case keyNotFound of True -> Left RegimentParseKeyNotFound -- returns a vector consisting of keys and payload - False -> Right $ (Boxed.fromList ks) Boxed.++ (Boxed.singleton bytes) + False -> Right $ (Boxed.fromList ks) Boxed.++ (Boxed.singleton unparsed) flushVector :: Grow.Grow Boxed.MVector (PrimState IO) (Boxed.Vector BS.ByteString) @@ -157,7 +159,7 @@ writeChunk h vs = kp <- hoistEither $ vecToKP bs liftIO $ writeCursor h kp if Boxed.null tl - then return () + then liftIO $ IO.hFlush h >> return () else writeChunk h tl writeCursor :: IO.Handle -> KeyedPayload -> IO () diff --git a/test/Test/IO/Regiment/IO.hs b/test/Test/IO/Regiment/IO.hs index 107e505..9fe4326 100644 --- a/test/Test/IO/Regiment/IO.hs +++ b/test/Test/IO/Regiment/IO.hs @@ -29,11 +29,10 @@ import Regiment.Parse import Regiment.Serial import Regiment.Vanguard.IO -import System.Directory (getDirectoryContents) import System.Exit -import System.FilePath (()) +import System.FilePath ((), takeBaseName) import System.IO (FilePath, IO, Handle, IOMode (..), hIsEOF, hClose, withBinaryFile) -import System.IO.Temp (withTempFile, withTempDirectory) +import System.IO.Temp (withTempDirectory, openTempFile) import System.Process (readProcessWithExitCode, createProcess, proc, waitForProcess, env) import Test.Parsley.Arbitrary @@ -41,7 +40,7 @@ import Test.Regiment.Arbitrary import Test.QuickCheck.Instances () import Test.QuickCheck.Jack (suchThat, property, forAllProperties, quickCheckWithResult) -import Test.QuickCheck.Jack (vectorOf, maxSuccess, stdArgs, counterexample, (===), mkJack_) +import Test.QuickCheck.Jack (listOfN, maxSuccess, stdArgs, counterexample, (===), mkJack_) import X.Control.Monad.Trans.Either (EitherT, newEitherT, runEitherT) @@ -78,81 +77,112 @@ prop_roundtrip_write_read_line = return $ result === expected -prop_roundtrip_write_read_sorted_tmp_file = - gamble (arbitrary `suchThat` (> 0)) $ \n -> - gamble (arbitrary `suchThat` (\f -> (formatColumnCount f) > 0)) $ \fmt -> - gamble (genListSortColumns fmt) $ \sc -> - gamble (vectorOf n $ genRealKP fmt sc) $ \kps -> - testIO . withTempFile "dist" "test-input-" $ \tmpFile hFile -> do - -- write rs to InputFile - -- note that our payloads are terminated by newlines so no need to - -- add terminating newline - BS.hPut hFile $ BS.concat (payload <$> kps) - hClose hFile - -- sort the rs in memory - let expected = payload <$> DL.sort kps - - -- toTempFiles on the input file that was just created - withTempDirectory "dist" "regiment-test" $ \tmp -> do - success <- runEitherT $ toTempFiles (InputFile tmpFile) (TempDirectory tmp) fmt sc (MemoryLimit (1024 * 1024)) - case success of - Left e -> - return $ counterexample ("toTempFiles errored out: " <> show e) False - Right _ -> do - -- read the contents of the sorted temp files - tmpFilePaths <- fmap (filter (flip notElem [".", ".."])) $ getDirectoryContents tmp - mresult <- runEitherT . readPayloads $ fmap (tmp ) tmpFilePaths - return $ case mresult of - Left _ -> counterexample "RegimentIOError" False - Right result -> expected === result - prop_regiment = - gamble (arbitrary `suchThat` (> 0)) $ \n -> - gamble genNonNullSeparator $ \sep -> - gamble (genRestrictedFormat sep) $ \fmt -> - gamble (genListSortColumns fmt) $ \sc -> - gamble (vectorOf n $ (mkJack_ $ genRow fmt) `suchThat` (not . BS.null)) $ \rs -> do - let inp = unlines fmt rs - testIO . withTempFile "dist" "test-input-" $ \tmpFile hFile -> do - BS.hPut hFile inp - hClose hFile - - withTempDirectory "dist" "regiment-test." $ \tmp -> do - success <- runEitherT $ regiment (InputFile tmpFile) - (Just . OutputFile $ tmp "regiment-sorted") - sc - (formatKind fmt) - (formatNewline fmt) - (NumColumns (formatColumnCount fmt)) - (formatSeparator fmt) - (MemoryLimit (1024 * 1024)) - case success of - Left e -> - return $ counterexample ("regiment errored out: " <> show e) False - Right _ -> do - let - -- below is to end up with key options for sort so that - -- e.g. sort cols of 2, 4 correspond to - -- "-k", "2,2", "-k", "4,4" - sc' = ((\k -> k + 1) . sortColumn) <$> sc - scs = [fmap (\s -> DL.filter (\c -> c /= '(' && c /= ')') s) $ show <$> (DL.zip sc' sc')] - ks = DL.concat . DL.transpose $ [DL.replicate (DL.length sc) (T.unpack "-k")] DL.++ scs - sepChar = BSC.unpack . BS.singleton . renderSeparator $ formatSeparator fmt - - (_, _, _, pr) <- createProcess ( proc "sort" - $ ks <> ["-t", sepChar, "-o", tmp "gnu-sorted", tmpFile] - ) { env = Just [("LC_COLLATE", "C")] } - ex <- waitForProcess pr - case ex of - ExitFailure _ -> return $ counterexample "gnu-sort errorred out" False - ExitSuccess -> do - (ex', so', _se') <- readProcessWithExitCode "diff" [ "-c" - , tmp "regiment-sorted" - , tmp "gnu-sorted" - ] "" - case ex' of - ExitSuccess -> return $ property True - ExitFailure _ -> return $ counterexample ("diff failed: " <> so') False + gamble genRestrictedFormat $ \fmt -> + gamble (genSortColumns fmt) $ \sc -> + gamble (listOfN 1 100 $ (mkJack_ $ genRow fmt) `suchThat` (not . BS.null)) $ \rs -> testIO $ do + withTempDirectory "dist" "regiment-test." $ \tmp -> do + tmpFile <- writeToTmpFile tmp fmt rs + success <- runEitherT $ regiment (InputFile tmpFile) + (Just . OutputFile $ tmp "regiment-sorted") + sc + (formatKind fmt) + (formatNewline fmt) + (NumColumns (formatColumnCount fmt)) + (formatSeparator fmt) + (MemoryLimit (1024 * 1024)) + case success of + Left e -> + return $ counterexample ("regiment errored out: " <> show e) False + Right _ -> do + let + -- below is to end up with key options for sort so that + -- e.g. sort cols of 2, 4 correspond to + -- "-k", "2,2", "-k", "4,4" + sc' = ((\k -> k + 1) . sortColumn) <$> sc + scs = [fmap (\s -> DL.filter (\c -> c /= '(' && c /= ')') s) $ show <$> (DL.zip sc' sc')] + ks = DL.concat . DL.transpose $ [DL.replicate (DL.length sc) (T.unpack "-k")] DL.++ scs + sepChar = BSC.unpack . BS.singleton . renderSeparator $ formatSeparator fmt + + (_, _, _, pr) <- createProcess ( proc "sort" + $ ks <> ["-t", sepChar, "-o", tmp "gnu-sorted", tmpFile] + ) { env = Just [("LC_COLLATE", "C")] } + ex <- waitForProcess pr + case ex of + ExitFailure _ -> return $ counterexample "gnu-sort errorred out" False + ExitSuccess -> do + (ex', so', _se') <- readProcessWithExitCode "diff" [ "-c" + , tmp "regiment-sorted" + , tmp "gnu-sorted" + ] "" + case ex' of + ExitSuccess -> return $ property True + ExitFailure _ -> return $ counterexample ("diff failed: " <> so') False + +prop_regiment_split_merge = + gamble genFormat $ \fmt -> + gamble (listOfN 1 5 $ listOfN 1 100 $ (mkJack_ $ genRow fmt) `suchThat` (not . BS.null)) $ \vrs -> testIO $ do + -- sort on all columns to avoid having to deal with differences + -- in ordering of rows when sort keys are the same. + let sc = SortColumn <$> [0 .. ((formatColumnCount fmt) - 1)] + withTempDirectory "dist" "regiment-test." $ \tmp -> do + chunkFiles <- mapM (writeToTmpFile tmp fmt) vrs + inn <- writeToTmpFile tmp fmt $ concat vrs + success <- runEitherT $ + regiment (InputFile inn) + (Just . OutputFile $ tmp "regiment-sorted") + sc + (formatKind fmt) + (formatNewline fmt) + (NumColumns (formatColumnCount fmt)) + (formatSeparator fmt) + (MemoryLimit (1024 * 1024)) + + case success of + Left e -> + return $ counterexample ("regiment errored out: " <> show e) False + Right _ -> do + splitSuccess <- + mapM (\nm -> runEitherT $ do + let tmpDir = TempDirectory $ tmp "splits" (takeBaseName nm) + createDirectory tmpDir + firstT RegimentIOParseError $ + split + (InputFile nm) + tmpDir + sc + (formatKind fmt) + (formatNewline fmt) + (NumColumns (formatColumnCount fmt)) + (formatSeparator fmt) + (MemoryLimit (1024 * 1024))) chunkFiles + case null $ lefts splitSuccess of + False -> + return $ counterexample ("split errored out " <> show splitSuccess) False + True -> do + mergeSuccess <- + runEitherT $ + mergeDirs (fmap (\nm -> TempDirectory $ tmp "splits" (takeBaseName nm)) chunkFiles) + (Just . OutputFile $ tmp "regiment-split-merge") + case mergeSuccess of + Left e -> + return $ counterexample ("merge errored out: " <> show e) False + Right _ -> do + (ex, so, _se) <- readProcessWithExitCode "diff" [ "-c" + , tmp "regiment-sorted" + , tmp "regiment-split-merge" + ] "" + case ex of + ExitSuccess -> return $ property True + ExitFailure _ -> return $ counterexample ("diff failed: " <> so) False + +writeToTmpFile :: FilePath -> Format -> [BS.ByteString] -> IO FilePath +writeToTmpFile tmp fmt vrs = do + let inp = unlines fmt vrs + (tmpFile, hFile) <- openTempFile tmp "test-input-" + BS.hPut hFile inp + hClose hFile + return tmpFile unlines fmt ls = let nl = renderNewline . formatNewline $ fmt diff --git a/test/Test/Regiment/Arbitrary.hs b/test/Test/Regiment/Arbitrary.hs index 9b2cd50..8307e0b 100644 --- a/test/Test/Regiment/Arbitrary.hs +++ b/test/Test/Regiment/Arbitrary.hs @@ -40,6 +40,35 @@ genKP numKeys = <$> Boxed.fromList <$> (vectorOf numKeys genKey) <*> genBytes +genRealKP :: Format -> [SortColumn] -> Jack KeyedPayload +genRealKP fmt sc = do + -- gen KeyedPayload using the gens from Parsley so that + -- we end up with an actual delimited or standardized payload + + -- assume that sc is legit - i.e of length < formatColumnCount and + -- a subset of 1 .. formatColumnCount + fields <- vectorOf (formatColumnCount fmt) (genBytes) + let + sortkeys = DL.map (\i -> Key $ fields DL.!! (sortColumn i)) sc + return $ KeyedPayload { + keys = Boxed.fromList sortkeys + , payload = Parsley.renderRow fmt (Boxed.fromList fields) <> (renderNewline $ formatNewline fmt) + } + +genListKPsUniqueKeys :: Jack [[KeyedPayload]] +genListKPsUniqueKeys = do + numKeys <- arbitrary `suchThat` (> 0) + numLists <- chooseInt (1,10) + maxListLength <- arbitrary `suchThat` (> 0) + forM [1 .. numLists] $ \i -> + genKPsUniqueKeys i numKeys maxListLength + +genListKPsNoPayload :: Jack [[KeyedPayload]] +genListKPsNoPayload = do + numKeys <- arbitrary `suchThat` (> 0) + numLists <- chooseInt (1,10) + vectorOf numLists $ listOfN 0 numLists (genKPNoPayload numKeys) + genKPNoPayload :: Int -> Jack KeyedPayload genKPNoPayload numKeys = do bs <- vectorOf numKeys genBytes @@ -51,12 +80,6 @@ genKPNoPayload numKeys = do , payload = p } -genListKPsNoPayload :: Jack [[KeyedPayload]] -genListKPsNoPayload = do - numKeys <- arbitrary `suchThat` (> 0) - numLists <- chooseInt (1,10) - vectorOf numLists $ listOfN 0 numLists (genKPNoPayload numKeys) - genKPsUniqueKeys :: Int -> Int -> Int -> Jack [KeyedPayload] genKPsUniqueKeys prefix numKeys maxListLength = do kps <- listOfN 0 maxListLength (genKP numKeys) @@ -71,14 +94,6 @@ genKPsUniqueKeys prefix numKeys maxListLength = do return $ prepend uniquifier <$> kps -genListKPsUniqueKeys :: Jack [[KeyedPayload]] -genListKPsUniqueKeys = do - numKeys <- arbitrary `suchThat` (> 0) - numLists <- chooseInt (1,10) - maxListLength <- arbitrary `suchThat` (> 0) - forM [1 .. numLists] $ \i -> - genKPsUniqueKeys i numKeys maxListLength - genCursor :: Int -> Handle -> Jack (Cursor Handle) genCursor n h = oneof [ @@ -86,8 +101,8 @@ genCursor n h = , NonEmpty <$> return h <*> (genKP n) ] -genListSortColumns :: Format -> Jack [SortColumn] -genListSortColumns fmt = do +genSortColumns :: Format -> Jack [SortColumn] +genSortColumns fmt = do sortcols <- sublistOf [0 .. ((formatColumnCount fmt) - 1)] `suchThat` (not . null) return $ SortColumn <$> sortcols @@ -102,20 +117,6 @@ genField fmt = Standardized -> do genStandardizedField strBSlistOf1 sep -genRealKP :: Format -> [SortColumn] -> Jack KeyedPayload -genRealKP fmt sc = do - -- gen KeyedPayload using the gens from Parsley so that - -- we end up with an actual delimited or standardized payload - - -- assume that sc is legit - i.e of length < formatColumnCount and - -- a subset of 1 .. formatColumnCount - fields <- vectorOf (formatColumnCount fmt) (genField fmt) - let - sortkeys = DL.map (\i -> Key $ fields DL.!! (sortColumn i)) sc - return $ KeyedPayload { - keys = Boxed.fromList sortkeys - , payload = Parsley.renderRow fmt (Boxed.fromList fields) <> (renderNewline $ formatNewline fmt) - } strBSlistOf1 :: QC.Gen BS.ByteString strBSlistOf1 = fmap BSC.pack . QC.listOf1 . QC.elements $ @@ -125,9 +126,17 @@ genNonNullSeparator :: Jack Separator genNonNullSeparator = arbitrary `suchThat` (\sep -> sep /= (Separator . fromIntegral $ ord '\NUL')) +genFormat :: Jack Format +genFormat = do + sep <- genNonNullSeparator + arbitrary `suchThat` (\fmt -> + (formatColumnCount fmt) > 0 + && (formatSeparator fmt) == sep + ) -genRestrictedFormat :: Separator -> Jack Format -genRestrictedFormat sep = +genRestrictedFormat :: Jack Format +genRestrictedFormat = do + sep <- genNonNullSeparator arbitrary `suchThat` (\fmt -> (formatColumnCount fmt) > 0 && (formatKind fmt) == Delimited diff --git a/test/Test/Regiment/Vanguard/List.hs b/test/Test/Regiment/Vanguard/List.hs index de86224..e218cee 100644 --- a/test/Test/Regiment/Vanguard/List.hs +++ b/test/Test/Regiment/Vanguard/List.hs @@ -37,6 +37,11 @@ prop_runVanguard_unique_keys = prop_runVanguard_possible_dupe_sortkeys :: Property prop_runVanguard_possible_dupe_sortkeys = + -- by artificially removing payloads we are able to avoid + -- test that runVanguard is the same as DL.sort. Retaining + -- payloads means that we have to accomodate differences in + -- the ordering of payloads between runVanguard and DL.sort + -- when sort keys are exactly the same. runVanguardOn genListKPsNoPayload return [] diff --git a/test/cli/basic-usage/run b/test/cli/basic-usage/run index dba81df..3b79c99 100755 --- a/test/cli/basic-usage/run +++ b/test/cli/basic-usage/run @@ -52,3 +52,38 @@ $REGIMENT sort -k 5 -c 15 -f ',' -o "path/to/output-file" input-file # all the things $REGIMENT sort -f ',' -k 1 -k 4 -k 5 -c 26 -m 10G --crlf --standardized -o "path/to/output-file" input-file + +# split +$REGIMENT split --key 5 --number-columns 15 --field-separator ',' --dir "path/to/output-dir" input-file +$REGIMENT split -k 5 -c 15 -f ',' -d "path/to/output-dir" input-file + +# specify multiple columns to sort on +$REGIMENT split --key 1 --key 5 --number-columns 15 --field-separator ',' --dir "path/to/output-dir" input-file +$REGIMENT split -k 1 -k 5 -c 15 -f ',' -d "path/to/output-dir" input-file + +# explicitly specify that format of input-file is standardized +$REGIMENT split --standardized --key 1 --number-columns 15 --field-separator ',' --dir "path/to/output-dir" input-file +$REGIMENT split --standardized -k 1 -c 15 -f ',' -d "path/to/output-dir" input-file + +# explicitly specify memory upper bound of 2 GB (default unit is MB - use G to specify GB) +$REGIMENT split --mem-max 2G --key 1 --number-columns 15 --field-separator ',' --dir "path/to/output-dir" input-file +$REGIMENT split -m 2G -k 1 -c 15 -f ',' -d "path/to/output-dir" input-file +$REGIMENT split -m 2000 -k 1 -c 15 -f ',' -d "path/to/output-dir" input-file + +# explicitly specify newline -- one of LF, CR or CRLF (defaults to LF) +$REGIMENT split --key 5 --number-columns 15 --field-separator ',' --crlf --dir "path/to/output-dir" input-file +$REGIMENT split -k 5 -c 15 -f ',' --crlf --dir "path/to/output-dir" input-file + +# all the things +$REGIMENT split -f ',' -k 1 -k 4 -k 5 -c 26 -m 10G --crlf --standardized -d "/path/to/output-dir" input-file + +# merge-tmps + +# write to stdout +$REGIMENT merge-tmps "/path/to/dir1" "/path/to/dir2" + +# specify output file +$REGIMENT merge-tmps --output "path/to/output-file" "/path/to/dir1" "/path/to/dir2" +$REGIMENT merge-tmps -o "path/to/output-file" "/path/to/dir1" "/path/to/dir2" + +