From 329ddd7c3cd5461e7465e879b8df7b188b5fc5d1 Mon Sep 17 00:00:00 2001
From: Navin Keswani <navin.keswani@ambiata.com>
Date: Thu, 30 Mar 2017 21:33:44 +1100
Subject: [PATCH] Implements split and merge commands.

---
 README.md                           |  43 ++++++-
 ambiata-regiment.cabal              |   1 +
 main/regiment.hs                    |  60 +++++++--
 src/Regiment/Data.hs                |   2 +-
 src/Regiment/IO.hs                  |  84 +++++++++++--
 src/Regiment/Parse.hs               |  18 +--
 test/Test/IO/Regiment/IO.hs         | 186 ++++++++++++++++------------
 test/Test/Regiment/Arbitrary.hs     |  73 ++++++-----
 test/Test/Regiment/Vanguard/List.hs |   5 +
 test/cli/basic-usage/run            |  35 ++++++
 10 files changed, 363 insertions(+), 144 deletions(-)
diff --git a/README.md b/README.md
index 3dd0ac8..220ea18 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,8 @@ A command line tool for sorting standardized separated files
 ## Usage
 --------
 
+`sort` a standardized separated file.
+
 ```
 # specify:
 #   column to sort on (mandatory)
@@ -48,4 +50,43 @@ regiment sort -k 5 -c 15 -f ',' -o "path/to/output-file" input-file
 regiment sort -f ',' -k 1 -k 4 -k 5 -c 26 -m 10G --crlf --standardized -o "path/to/output-file" input-file
 ```
 
-Note: `regiment` requires local storage roughly equivalent to the size of the inputs, and follows unix `TMPDIR` conventions for that storage.
+`split` a standardized separated file into a set of temporary files, each of which is sorted,
+and is in regiment's [binary format](doc/temp-file-format.md)
+
+```
+# specify:
+#   same options as for sort (except for --output)
+#   a directory within which to write the sorted splits (mandatory)
+#   NOTE: this directory must not exist, it will be created for you
+regiment split <same opts as sort> --dir "path/to/output-dir" input-file
+regiment split <same opts as sort> -d "path/to/output-dir" input-file
+```
+
+Given the format of an input standardized separated file, merge a set of sorted temporary files
+(in regiment's [binary format](doc/temp-file-format.md)) into an output-file (that has the same format
+as the input standardized separated file).
+
+```
+# specify:
+#   directories containing sorted splits that require merging (typically outputs of running split)
+#   output file (optional) -- defaults to stdout
+regiment merge-tmps dir1 dir2 ... dirn
+
+# explicity specify path to output file -- defaults to stdout
+regiment merge-tmps --output "path/to/output-file" dir1 dir2 ... dirn
+regiment merge-tmps -o "path/to/output-file" dir1 dir2 ... dirn
+```
+
+Relationship between `sort`, `split` and `merge-tmps`
+
+```
+regiment sort -k 1 -c 5 -f ',' --standardized input-file 
+
+generates the same output as
+
+regiment split -k 1 -c 5 -f ',' --standardized -d "/foo/bar/baz" input-file
+regiment merge-tmps "/foo/bar/baz"
+```
+
+Note: `regiment` requires local storage roughly equivalent to the size of the inputs,
+and follows unix `TMPDIR` conventions for that storage.
diff --git a/ambiata-regiment.cabal b/ambiata-regiment.cabal
index 754a2ec..4ae69c3 100644
--- a/ambiata-regiment.cabal
+++ b/ambiata-regiment.cabal
@@ -21,6 +21,7 @@ library
                     , binary                          >= 0.7        && < 0.9
                     , bytestring                      == 0.10.*
                     , directory                       == 1.2.*
+                    , exceptions                      >= 0.6        && < 0.9
                     , filepath                        == 1.3.*
                     , heaps                           == 0.3.*
                     , primitive                       == 0.6.*
diff --git a/main/regiment.hs b/main/regiment.hs
index d40af8d..88b60a8 100644
--- a/main/regiment.hs
+++ b/main/regiment.hs
@@ -6,7 +6,7 @@ import           DependencyInfo_ambiata_regiment
 
 import qualified Data.Attoparsec.Text as A
 import           Data.Char (ord)
-import           Data.Text as T
+import qualified Data.Text as T
 import           Data.Word (Word8)
 
 import           Options.Applicative
@@ -33,22 +33,44 @@ main = do
       SortCommand inn out nc sc sep m f n ->
         orDie renderRegimentIOError $
           regiment inn out sc f n nc sep m
+      SplitCommand inn nc sc sep m f n tmp ->
+        orDie renderRegimentIOError $ do
+          createDirectory tmp
+          firstT RegimentIOParseError $
+            split inn tmp sc f n nc sep m
+      MergeCommand out dirs ->
+        orDie renderRegimentIOError $
+          mergeDirs dirs out
 
 parser :: Parser Command
 parser =
   subparser $
-    command' "sort" "Sort input file based on sort column(s)."
-      (SortCommand <$> inputFileP
-                   <*> optional outputP
-                   <*> numColumnsP
-                   <*> some sortColumnP
-                   <*> separatorP
-                   <*> memP
-                   <*> formatP
-                   <*> newlineP)
+     command' "sort" "Sort input file based on sort column(s)."
+       (SortCommand <$> inputFileP
+                    <*> optional outputP
+                    <*> numColumnsP
+                    <*> some sortColumnP
+                    <*> separatorP
+                    <*> memP
+                    <*> formatP
+                    <*> newlineP)
+  <> command' "split" "Split input file into sorted chunks (intermediate state of sort)."
+       (SplitCommand <$> inputFileP
+                     <*> numColumnsP
+                     <*> some sortColumnP
+                     <*> separatorP
+                     <*> memP
+                     <*> formatP
+                     <*> newlineP
+                     <*> tempDirectoryArgP)
+  <> command' "merge-tmps" "Merge sorted temp files (output of split)."
+       (MergeCommand <$> optional outputP
+                     <*> some tempDirectoryP)
 
 data Command =
-  SortCommand InputFile (Maybe OutputFile) NumColumns [SortColumn] Separator MemoryLimit FormatKind Newline
+    SortCommand InputFile (Maybe OutputFile) NumColumns [SortColumn] Separator MemoryLimit FormatKind Newline
+  | SplitCommand InputFile NumColumns [SortColumn] Separator MemoryLimit FormatKind Newline TempDirectory
+  | MergeCommand (Maybe OutputFile) [TempDirectory]
   deriving (Eq, Show)
 
 inputFileP :: Parser InputFile
@@ -65,6 +87,22 @@ outputP = OutputFile <$> (strOption $
   <> metavar "FILE"
   <> help "Optional path to output file -- defaults to stdout.")
 
+tempDirectoryP :: Parser TempDirectory
+tempDirectoryP =
+  fmap TempDirectory . strArgument . mconcat $ [
+      metavar "TMP_DIRECTORY"
+    , help "Path to directory containing intermediate sorted files."
+    ]
+
+tempDirectoryArgP :: Parser TempDirectory
+tempDirectoryArgP =
+  fmap TempDirectory . strOption . mconcat $ [
+      long "dir"
+    , short 'd'
+    , metavar "TMP_DIRECTORY"
+    , help "Path to directory to write out intermediate sorted files."
+    ]
+
 sortColumnP :: Parser SortColumn
 sortColumnP =
   fmap (SortColumn . (\k -> k - 1)) . option auto . mconcat $ [
diff --git a/src/Regiment/Data.hs b/src/Regiment/Data.hs
index 3d10a32..2240d58 100644
--- a/src/Regiment/Data.hs
+++ b/src/Regiment/Data.hs
@@ -112,7 +112,7 @@ countKeyedPayload kp =
 --   └─────────┴─────────┴─────────┘
 
 data Cursor a =
-    NonEmpty a KeyedPayload
+    NonEmpty !a !KeyedPayload
   | EOF
   deriving (Show)
 
diff --git a/src/Regiment/IO.hs b/src/Regiment/IO.hs
index 7c3d9c0..c7c4939 100644
--- a/src/Regiment/IO.hs
+++ b/src/Regiment/IO.hs
@@ -2,15 +2,22 @@
 {-# LANGUAGE OverloadedStrings #-}
 module Regiment.IO (
     RegimentIOError (..)
+  , mergeDirs
   , regiment
   , renderRegimentIOError
+  , split
   , open
+  , createDirectory
   ) where
 
-import           Control.Monad.IO.Class (liftIO)
+import           Control.Exception (SomeException)
+import           Control.Monad.Catch (MonadCatch)
+import           Control.Monad.IO.Class (liftIO, MonadIO)
 import           Control.Monad.Trans.Resource (MonadResource (..))
 import qualified Control.Monad.Trans.Resource as R
 
+import qualified Data.Text as T
+
 import           P
 
 import           Regiment.Data
@@ -18,16 +25,19 @@ import           Regiment.Parse
 import           Regiment.Vanguard.Base
 import           Regiment.Vanguard.IO
 
-import           System.Directory (getDirectoryContents)
+import qualified System.Directory as SD
 import           System.FilePath ((</>))
-import           System.IO (IO, stdout, IOMode (..), FilePath, Handle, hClose, openBinaryFile)
+import           System.IO (IO, stdout, IOMode (..), FilePath, Handle, hClose, openBinaryFile, hFlush)
 import           System.IO.Temp (withSystemTempDirectory)
 
-import           X.Control.Monad.Trans.Either (EitherT, newEitherT, mapEitherT, runEitherT, firstEitherT)
+import           X.Control.Monad.Trans.Either (EitherT, newEitherT, mapEitherT, left)
+import           X.Control.Monad.Trans.Either (runEitherT, firstEitherT, tryEitherT)
 
 data RegimentIOError =
     RegimentIOParseError RegimentParseError
   | RegimentIOMergeError (RegimentMergeError RegimentMergeIOError)
+  | RegimentIOCreateDirectoryError TempDirectory Text
+  | RegimentIOTmpDirExistsError TempDirectory
   deriving (Eq, Show)
 
 renderRegimentIOError :: RegimentIOError -> Text
@@ -37,6 +47,15 @@ renderRegimentIOError err =
       renderRegimentParseError e
     RegimentIOMergeError e ->
       renderRegimentMergeError renderRegimentMergeIOError e
+    RegimentIOCreateDirectoryError (TempDirectory tmp) t ->
+         "Failed to create directory "
+      <> T.pack tmp
+      <> "Error: " <> t
+    RegimentIOTmpDirExistsError (TempDirectory tmp) ->
+         "Failed to create directory "
+      <> T.pack tmp
+      <> "\n"
+      <> "Target directory should not exist, it will be created and filled."
 
 regiment ::
      InputFile
@@ -49,6 +68,24 @@ regiment ::
   -> MemoryLimit
   -> EitherT RegimentIOError IO ()
 regiment inn out sc f n nc sep m = do
+  firstEitherT RegimentIOParseError . newEitherT $
+    withSystemTempDirectory "regiment." $ \tmp ->
+      R.runResourceT . runEitherT $ do
+        mapEitherT liftIO $ split inn (TempDirectory tmp) sc f n nc sep m
+        handles <- openDir (TempDirectory tmp)
+        mapEitherT liftIO . firstT RegimentParseMergeError $ merge handles out
+
+split ::
+     InputFile
+  -> TempDirectory
+  -> [SortColumn]
+  -> FormatKind
+  -> Newline
+  -> NumColumns
+  -> Separator
+  -> MemoryLimit
+  -> EitherT RegimentParseError IO ()
+split inn tmp sc f n nc sep m = do
   let
     fmt =
       Format {
@@ -57,26 +94,47 @@ regiment inn out sc f n nc sep m = do
       , formatColumnCount = numColumns nc
       , formatKind = f
       }
-  firstEitherT RegimentIOParseError . newEitherT $
-    withSystemTempDirectory "regiment." $ \tmp ->
-      runEitherT $ do
-        toTempFiles inn (TempDirectory tmp) fmt sc m
-        firstT RegimentParseMergeError $ merge (TempDirectory tmp) out
+  toTempFiles inn tmp fmt sc m
+
+mergeDirs ::
+     [TempDirectory]
+  -> Maybe OutputFile
+  -> EitherT RegimentIOError IO ()
+mergeDirs dirs out = firstT RegimentIOMergeError $
+  mapEitherT R.runResourceT $ do
+    handles <- mapM openDir dirs
+    mapEitherT liftIO $ merge (concat handles) out
 
 merge ::
-     TempDirectory
+     [Handle]
   -> Maybe OutputFile
   -> EitherT (RegimentMergeError RegimentMergeIOError) IO ()
-merge (TempDirectory tmp) out = mapEitherT R.runResourceT  $ do
-  fs <- liftIO $ fmap (filter (flip notElem [".", ".."])) $ getDirectoryContents tmp
-  handles <- mapM (open ReadMode) $ fmap (tmp </>) fs
+merge handles out = mapEitherT R.runResourceT $ do
   v <- mapEitherT liftIO $ formVanguardIO handles
   out' <- case out of
             Just (OutputFile o) -> open WriteMode o
             Nothing -> return stdout
   mapEitherT liftIO $ runVanguardIO v out'
+  liftIO $ hFlush out'
+
+openDir :: MonadResource m => TempDirectory -> m [Handle]
+openDir (TempDirectory tmp) = do
+  fs <- liftIO $ fmap (filter (flip notElem [".", ".."])) $ SD.getDirectoryContents tmp
+  handles <- mapM (open ReadMode) $ fmap (tmp </>) fs
+  return handles
 
 open :: MonadResource m => IOMode -> FilePath -> m Handle
 open m f = do
   snd <$> R.allocate (openBinaryFile f m) hClose
 
+createDirectory :: (MonadIO m, MonadCatch m) => TempDirectory -> EitherT RegimentIOError m ()
+createDirectory t@(TempDirectory tmp) = do
+  exists <- liftIO $ SD.doesDirectoryExist tmp
+  if exists
+    then
+      left $ RegimentIOTmpDirExistsError t
+    else
+      tryEitherT handler . liftIO $ SD.createDirectoryIfMissing True tmp
+  where
+    handler :: SomeException -> RegimentIOError
+    handler e = RegimentIOCreateDirectoryError t $ T.pack (show e)
diff --git a/src/Regiment/Parse.hs b/src/Regiment/Parse.hs
index 2d2f991..3439926 100644
--- a/src/Regiment/Parse.hs
+++ b/src/Regiment/Parse.hs
@@ -25,6 +25,7 @@ import qualified Data.Vector.Algorithms.Tim as Tim
 import           P
 
 import qualified Parsley.Xsv.Parser as Parsley
+import qualified Parsley.Xsv.Render as Parsley
 
 import           Regiment.Data
 import           Regiment.Serial
@@ -93,8 +94,7 @@ toTempFiles (InputFile inn) tmpDir f sc (MemoryLimit cap) = do
                   >> go counter (drops + 1) partNum memCounter rest)
               (Parsley.Success $ \rest fields ->
                 let
-                  bytesParsed = BS.take (BS.length bytes - BS.length rest) bytes
-                  sko = selectSortKeys bytesParsed (Parsley.getFields fields) sc
+                  sko = selectSortKeys fields f sc
                 in
                   case sko of
                     Left _ ->
@@ -116,20 +116,22 @@ toTempFiles (InputFile inn) tmpDir f sc (MemoryLimit cap) = do
     liftIO (BS.hGetSome h innChunkSize) >>= go (0 :: Int) (0 :: Int) (0 :: Int) (0 :: Int)
 
 selectSortKeys ::
-     BS.ByteString
-  -> (Boxed.Vector BS.ByteString)
+     Parsley.Fields
+  -> Format
   -> [SortColumn]
   -> Either RegimentParseError (Boxed.Vector BS.ByteString)
-selectSortKeys bytes parsed sortColumns =
+selectSortKeys fields fmt sortColumns =
   let
+    parsed = Parsley.getFields fields
+    unparsed = Parsley.renderRow fmt parsed <> (Parsley.renderNewline $ formatNewline fmt)
     maybeSortkeys = L.map (\sc -> parsed Boxed.!? (sortColumn sc)) sortColumns
     ks = DM.catMaybes maybeSortkeys
     keyNotFound = and $ L.map isNothing maybeSortkeys
-  in do
+  in
     case keyNotFound of
       True -> Left RegimentParseKeyNotFound
       -- returns a vector consisting of keys and payload
-      False -> Right $ (Boxed.fromList ks) Boxed.++ (Boxed.singleton bytes)
+      False -> Right $ (Boxed.fromList ks) Boxed.++ (Boxed.singleton unparsed)
 
 flushVector ::
      Grow.Grow Boxed.MVector (PrimState IO) (Boxed.Vector BS.ByteString)
@@ -157,7 +159,7 @@ writeChunk h vs =
       kp <- hoistEither $ vecToKP bs
       liftIO $ writeCursor h kp
       if Boxed.null tl
-        then return ()
+        then liftIO $ IO.hFlush h >> return ()
         else writeChunk h tl
 
 writeCursor :: IO.Handle -> KeyedPayload -> IO ()
diff --git a/test/Test/IO/Regiment/IO.hs b/test/Test/IO/Regiment/IO.hs
index 107e505..9fe4326 100644
--- a/test/Test/IO/Regiment/IO.hs
+++ b/test/Test/IO/Regiment/IO.hs
@@ -29,11 +29,10 @@ import           Regiment.Parse
 import           Regiment.Serial
 import           Regiment.Vanguard.IO
 
-import           System.Directory (getDirectoryContents)
 import           System.Exit
-import           System.FilePath ((</>))
+import           System.FilePath ((</>), takeBaseName)
 import           System.IO (FilePath, IO, Handle, IOMode (..), hIsEOF, hClose, withBinaryFile)
-import           System.IO.Temp (withTempFile, withTempDirectory)
+import           System.IO.Temp (withTempDirectory, openTempFile)
 import           System.Process (readProcessWithExitCode, createProcess, proc, waitForProcess, env)
 
 import           Test.Parsley.Arbitrary
@@ -41,7 +40,7 @@ import           Test.Regiment.Arbitrary
 
 import           Test.QuickCheck.Instances ()
 import           Test.QuickCheck.Jack (suchThat, property, forAllProperties, quickCheckWithResult)
-import           Test.QuickCheck.Jack (vectorOf, maxSuccess, stdArgs, counterexample, (===), mkJack_)
+import           Test.QuickCheck.Jack (listOfN, maxSuccess, stdArgs, counterexample, (===), mkJack_)
 
 import           X.Control.Monad.Trans.Either (EitherT, newEitherT, runEitherT)
 
@@ -78,81 +77,112 @@ prop_roundtrip_write_read_line =
 
           return $ result === expected
 
-prop_roundtrip_write_read_sorted_tmp_file =
-  gamble (arbitrary `suchThat` (> 0)) $ \n ->
-  gamble (arbitrary `suchThat` (\f -> (formatColumnCount f) > 0)) $ \fmt ->
-  gamble (genListSortColumns fmt) $ \sc ->
-  gamble (vectorOf n $ genRealKP fmt sc) $ \kps ->
-    testIO . withTempFile "dist" "test-input-" $ \tmpFile hFile -> do
-      -- write rs to InputFile
-      -- note that our payloads are terminated by newlines so no need to
-      -- add terminating newline
-      BS.hPut hFile $ BS.concat (payload <$> kps)
-      hClose hFile
-      -- sort the rs in memory
-      let expected = payload <$> DL.sort kps
-
-      -- toTempFiles on the input file that was just created
-      withTempDirectory "dist" "regiment-test" $ \tmp -> do
-        success <- runEitherT $ toTempFiles (InputFile tmpFile) (TempDirectory tmp) fmt sc (MemoryLimit (1024 * 1024))
-        case success of
-          Left e ->
-            return $ counterexample ("toTempFiles errored out: " <> show e) False
-          Right _ -> do
-            -- read the contents of the sorted temp files
-            tmpFilePaths <- fmap (filter (flip notElem [".", ".."])) $ getDirectoryContents tmp
-            mresult <- runEitherT . readPayloads $ fmap (tmp </>) tmpFilePaths
-            return $ case mresult of
-              Left _ -> counterexample "RegimentIOError" False
-              Right result -> expected === result
-
 prop_regiment =
-  gamble (arbitrary `suchThat` (> 0)) $ \n ->
-  gamble genNonNullSeparator $ \sep ->
-  gamble (genRestrictedFormat sep) $ \fmt ->
-  gamble (genListSortColumns fmt) $ \sc ->
-  gamble (vectorOf n $ (mkJack_ $ genRow fmt) `suchThat` (not . BS.null)) $ \rs -> do
-    let inp = unlines fmt rs
-    testIO . withTempFile "dist" "test-input-" $ \tmpFile hFile -> do
-      BS.hPut hFile inp
-      hClose hFile
-
-      withTempDirectory "dist" "regiment-test." $ \tmp -> do
-        success <- runEitherT $ regiment (InputFile tmpFile)
-                                         (Just . OutputFile $ tmp </> "regiment-sorted")
-                                         sc
-                                         (formatKind fmt)
-                                         (formatNewline fmt)
-                                         (NumColumns (formatColumnCount fmt))
-                                         (formatSeparator fmt)
-                                         (MemoryLimit (1024 * 1024))
-        case success of
-          Left e ->
-            return $ counterexample ("regiment errored out: " <> show e) False
-          Right _ -> do
-            let
-              -- below is to end up with key options for sort so that
-              -- e.g. sort cols of 2, 4 correspond to
-              -- "-k", "2,2", "-k", "4,4"
-              sc' = ((\k -> k + 1) . sortColumn) <$> sc
-              scs = [fmap (\s -> DL.filter (\c -> c /= '(' && c /= ')') s) $ show <$> (DL.zip sc' sc')]
-              ks = DL.concat . DL.transpose $ [DL.replicate (DL.length sc) (T.unpack "-k")] DL.++ scs
-              sepChar = BSC.unpack . BS.singleton . renderSeparator $ formatSeparator fmt
-
-            (_, _, _, pr) <- createProcess ( proc "sort"
-                                           $ ks <> ["-t", sepChar, "-o", tmp </> "gnu-sorted", tmpFile]
-                                           ) { env = Just [("LC_COLLATE", "C")] }
-            ex <- waitForProcess pr
-            case ex of
-              ExitFailure _ -> return $ counterexample "gnu-sort errorred out" False
-              ExitSuccess -> do
-                (ex', so', _se') <- readProcessWithExitCode "diff" [ "-c"
-                                                                     , tmp </> "regiment-sorted"
-                                                                     , tmp </> "gnu-sorted"
-                                                                   ] ""
-                case ex' of
-                  ExitSuccess -> return $ property True
-                  ExitFailure _ -> return $ counterexample ("diff failed: " <> so') False
+  gamble genRestrictedFormat $ \fmt ->
+  gamble (genSortColumns fmt) $ \sc ->
+  gamble (listOfN 1 100 $ (mkJack_ $ genRow fmt) `suchThat` (not . BS.null)) $ \rs -> testIO $ do
+    withTempDirectory "dist" "regiment-test." $ \tmp -> do
+      tmpFile <- writeToTmpFile tmp fmt rs
+      success <- runEitherT $ regiment (InputFile tmpFile)
+                                       (Just . OutputFile $ tmp </> "regiment-sorted")
+                                       sc
+                                       (formatKind fmt)
+                                       (formatNewline fmt)
+                                       (NumColumns (formatColumnCount fmt))
+                                       (formatSeparator fmt)
+                                       (MemoryLimit (1024 * 1024))
+      case success of
+        Left e ->
+          return $ counterexample ("regiment errored out: " <> show e) False
+        Right _ -> do
+          let
+            -- below is to end up with key options for sort so that
+            -- e.g. sort cols of 2, 4 correspond to
+            -- "-k", "2,2", "-k", "4,4"
+            sc' = ((\k -> k + 1) . sortColumn) <$> sc
+            scs = [fmap (\s -> DL.filter (\c -> c /= '(' && c /= ')') s) $ show <$> (DL.zip sc' sc')]
+            ks = DL.concat . DL.transpose $ [DL.replicate (DL.length sc) (T.unpack "-k")] DL.++ scs
+            sepChar = BSC.unpack . BS.singleton . renderSeparator $ formatSeparator fmt
+
+          (_, _, _, pr) <- createProcess ( proc "sort"
+                                         $ ks <> ["-t", sepChar, "-o", tmp </> "gnu-sorted", tmpFile]
+                                         ) { env = Just [("LC_COLLATE", "C")] }
+          ex <- waitForProcess pr
+          case ex of
+            ExitFailure _ -> return $ counterexample "gnu-sort errorred out" False
+            ExitSuccess -> do
+              (ex', so', _se') <- readProcessWithExitCode "diff" [ "-c"
+                                                                   , tmp </> "regiment-sorted"
+                                                                   , tmp </> "gnu-sorted"
+                                                                 ] ""
+              case ex' of
+                ExitSuccess -> return $ property True
+                ExitFailure _ -> return $ counterexample ("diff failed: " <> so') False
+
+prop_regiment_split_merge =
+  gamble genFormat $ \fmt ->
+  gamble (listOfN 1 5 $ listOfN 1 100 $ (mkJack_ $ genRow fmt) `suchThat` (not . BS.null)) $ \vrs -> testIO $ do
+    -- sort on all columns to avoid having to deal with differences
+    -- in ordering of rows when sort keys are the same.
+    let sc = SortColumn <$> [0 .. ((formatColumnCount fmt) - 1)]
+    withTempDirectory "dist" "regiment-test." $ \tmp -> do
+      chunkFiles <- mapM (writeToTmpFile tmp fmt) vrs
+      inn <- writeToTmpFile tmp fmt $ concat vrs
+      success <- runEitherT $
+        regiment (InputFile inn)
+                 (Just . OutputFile $ tmp </> "regiment-sorted")
+                 sc
+                 (formatKind fmt)
+                 (formatNewline fmt)
+                 (NumColumns (formatColumnCount fmt))
+                 (formatSeparator fmt)
+                 (MemoryLimit (1024 * 1024))
+
+      case success of
+        Left e ->
+          return $ counterexample ("regiment errored out: " <> show e) False
+        Right _ -> do
+          splitSuccess <-
+            mapM (\nm -> runEitherT $ do
+                   let tmpDir = TempDirectory $ tmp </> "splits" </> (takeBaseName nm)
+                   createDirectory tmpDir
+                   firstT RegimentIOParseError $
+                     split
+                      (InputFile nm)
+                      tmpDir
+                      sc
+                      (formatKind fmt)
+                      (formatNewline fmt)
+                      (NumColumns (formatColumnCount fmt))
+                      (formatSeparator fmt)
+                      (MemoryLimit (1024 * 1024))) chunkFiles
+          case null $ lefts splitSuccess of
+            False ->
+              return $ counterexample ("split errored out " <> show splitSuccess) False
+            True -> do
+              mergeSuccess <-
+                runEitherT $
+                  mergeDirs (fmap (\nm -> TempDirectory $ tmp </> "splits" </> (takeBaseName nm)) chunkFiles)
+                            (Just . OutputFile $ tmp </> "regiment-split-merge")
+              case mergeSuccess of
+                Left e ->
+                  return $ counterexample ("merge errored out: " <> show e) False
+                Right _ -> do
+                  (ex, so, _se) <- readProcessWithExitCode "diff" [ "-c"
+                                                                  , tmp </> "regiment-sorted"
+                                                                  , tmp </> "regiment-split-merge"
+                                                                  ] ""
+                  case ex of
+                    ExitSuccess -> return $ property True
+                    ExitFailure _ -> return $ counterexample ("diff failed: " <> so) False
+
+writeToTmpFile :: FilePath -> Format -> [BS.ByteString] -> IO FilePath
+writeToTmpFile tmp fmt vrs = do
+    let inp = unlines fmt vrs
+    (tmpFile, hFile) <- openTempFile tmp "test-input-"
+    BS.hPut hFile inp
+    hClose hFile
+    return tmpFile
 
 unlines fmt ls =
   let nl = renderNewline . formatNewline $ fmt
diff --git a/test/Test/Regiment/Arbitrary.hs b/test/Test/Regiment/Arbitrary.hs
index 9b2cd50..8307e0b 100644
--- a/test/Test/Regiment/Arbitrary.hs
+++ b/test/Test/Regiment/Arbitrary.hs
@@ -40,6 +40,35 @@ genKP numKeys =
     <$> Boxed.fromList <$> (vectorOf numKeys genKey)
     <*> genBytes
 
+genRealKP :: Format -> [SortColumn] -> Jack KeyedPayload
+genRealKP fmt sc = do
+  -- gen KeyedPayload using the gens from Parsley so that
+  -- we end up with an actual delimited or standardized payload
+
+  -- assume that sc is legit - i.e of length < formatColumnCount and
+  -- a subset of 1 .. formatColumnCount
+  fields <- vectorOf (formatColumnCount fmt) (genBytes)
+  let
+    sortkeys = DL.map (\i -> Key $ fields DL.!! (sortColumn i)) sc
+  return $ KeyedPayload {
+      keys = Boxed.fromList sortkeys
+    , payload = Parsley.renderRow fmt (Boxed.fromList fields) <> (renderNewline $ formatNewline fmt)
+  }
+
+genListKPsUniqueKeys :: Jack [[KeyedPayload]]
+genListKPsUniqueKeys = do
+  numKeys <- arbitrary `suchThat` (> 0)
+  numLists <- chooseInt (1,10)
+  maxListLength <- arbitrary `suchThat` (> 0)
+  forM [1 .. numLists] $ \i ->
+    genKPsUniqueKeys i numKeys maxListLength
+
+genListKPsNoPayload :: Jack [[KeyedPayload]]
+genListKPsNoPayload = do
+  numKeys <- arbitrary `suchThat` (> 0)
+  numLists <- chooseInt (1,10)
+  vectorOf numLists $ listOfN 0 numLists (genKPNoPayload numKeys)
+
 genKPNoPayload :: Int -> Jack KeyedPayload
 genKPNoPayload numKeys = do
   bs <- vectorOf numKeys genBytes
@@ -51,12 +80,6 @@ genKPNoPayload numKeys = do
     , payload = p
     }
 
-genListKPsNoPayload :: Jack [[KeyedPayload]]
-genListKPsNoPayload = do
-  numKeys <- arbitrary `suchThat` (> 0)
-  numLists <- chooseInt (1,10)
-  vectorOf numLists $ listOfN 0 numLists (genKPNoPayload numKeys)
-
 genKPsUniqueKeys :: Int -> Int -> Int -> Jack [KeyedPayload]
 genKPsUniqueKeys prefix numKeys maxListLength = do
   kps <- listOfN 0 maxListLength (genKP numKeys)
@@ -71,14 +94,6 @@ genKPsUniqueKeys prefix numKeys maxListLength = do
 
   return $ prepend uniquifier <$> kps
 
-genListKPsUniqueKeys :: Jack [[KeyedPayload]]
-genListKPsUniqueKeys = do
-  numKeys <- arbitrary `suchThat` (> 0)
-  numLists <- chooseInt (1,10)
-  maxListLength <- arbitrary `suchThat` (> 0)
-  forM [1 .. numLists] $ \i ->
-    genKPsUniqueKeys i numKeys maxListLength
-
 genCursor :: Int -> Handle -> Jack (Cursor Handle)
 genCursor n h =
   oneof [
@@ -86,8 +101,8 @@ genCursor n h =
     , NonEmpty <$> return h <*> (genKP n)
   ]
 
-genListSortColumns :: Format -> Jack [SortColumn]
-genListSortColumns fmt = do
+genSortColumns :: Format -> Jack [SortColumn]
+genSortColumns fmt = do
   sortcols <- sublistOf [0 .. ((formatColumnCount fmt) - 1)] `suchThat` (not . null)
   return $ SortColumn <$> sortcols
 
@@ -102,20 +117,6 @@ genField fmt =
       Standardized -> do
         genStandardizedField strBSlistOf1 sep
 
-genRealKP :: Format -> [SortColumn] -> Jack KeyedPayload
-genRealKP fmt sc = do
-  -- gen KeyedPayload using the gens from Parsley so that
-  -- we end up with an actual delimited or standardized payload
-
-  -- assume that sc is legit - i.e of length < formatColumnCount and
-  -- a subset of 1 .. formatColumnCount
-  fields <- vectorOf (formatColumnCount fmt) (genField fmt)
-  let
-    sortkeys = DL.map (\i -> Key $ fields DL.!! (sortColumn i)) sc
-  return $ KeyedPayload {
-      keys = Boxed.fromList sortkeys
-    , payload = Parsley.renderRow fmt (Boxed.fromList fields) <> (renderNewline $ formatNewline fmt)
-  }
 
 strBSlistOf1 :: QC.Gen BS.ByteString
 strBSlistOf1 = fmap BSC.pack . QC.listOf1 . QC.elements $
@@ -125,9 +126,17 @@ genNonNullSeparator :: Jack Separator
 genNonNullSeparator =
   arbitrary `suchThat` (\sep -> sep /= (Separator . fromIntegral $ ord '\NUL'))
 
+genFormat :: Jack Format
+genFormat = do
+  sep <- genNonNullSeparator
+  arbitrary `suchThat` (\fmt ->
+                          (formatColumnCount fmt) > 0
+                       && (formatSeparator fmt) == sep
+                       )
 
-genRestrictedFormat :: Separator -> Jack Format
-genRestrictedFormat sep =
+genRestrictedFormat :: Jack Format
+genRestrictedFormat = do
+  sep <- genNonNullSeparator
   arbitrary `suchThat` (\fmt ->
                           (formatColumnCount fmt) > 0
                        && (formatKind fmt) == Delimited
diff --git a/test/Test/Regiment/Vanguard/List.hs b/test/Test/Regiment/Vanguard/List.hs
index de86224..e218cee 100644
--- a/test/Test/Regiment/Vanguard/List.hs
+++ b/test/Test/Regiment/Vanguard/List.hs
@@ -37,6 +37,11 @@ prop_runVanguard_unique_keys =
 
 prop_runVanguard_possible_dupe_sortkeys :: Property
 prop_runVanguard_possible_dupe_sortkeys =
+  -- by artificially removing payloads we are able to avoid
+  -- test that runVanguard is the same as DL.sort. Retaining
+  -- payloads means that we have to accomodate differences in
+  -- the ordering of payloads between runVanguard and DL.sort
+  -- when sort keys are exactly the same.
   runVanguardOn genListKPsNoPayload
 
 return []
diff --git a/test/cli/basic-usage/run b/test/cli/basic-usage/run
index dba81df..3b79c99 100755
--- a/test/cli/basic-usage/run
+++ b/test/cli/basic-usage/run
@@ -52,3 +52,38 @@ $REGIMENT sort -k 5 -c 15 -f ',' -o "path/to/output-file" input-file
 
 # all the things
 $REGIMENT sort -f ',' -k 1 -k 4 -k 5 -c 26 -m 10G --crlf --standardized -o "path/to/output-file" input-file
+
+# split
+$REGIMENT split --key 5 --number-columns 15 --field-separator ','  --dir "path/to/output-dir" input-file 
+$REGIMENT split -k 5 -c 15 -f ',' -d "path/to/output-dir" input-file
+
+# specify multiple columns to sort on
+$REGIMENT split --key 1 --key 5 --number-columns 15 --field-separator ',' --dir "path/to/output-dir" input-file
+$REGIMENT split -k 1 -k 5 -c 15 -f ',' -d "path/to/output-dir" input-file
+
+# explicitly specify that format of input-file is standardized
+$REGIMENT split --standardized --key 1 --number-columns 15 --field-separator ',' --dir "path/to/output-dir" input-file
+$REGIMENT split --standardized -k 1 -c 15 -f ',' -d "path/to/output-dir" input-file
+
+# explicitly specify memory upper bound of 2 GB (default unit is MB - use G to specify GB)
+$REGIMENT split --mem-max 2G --key 1 --number-columns 15 --field-separator ',' --dir "path/to/output-dir" input-file
+$REGIMENT split -m 2G -k 1 -c 15 -f ',' -d "path/to/output-dir" input-file
+$REGIMENT split -m 2000 -k 1 -c 15 -f ',' -d "path/to/output-dir" input-file
+
+# explicitly specify newline -- one of LF, CR or CRLF (defaults to LF)
+$REGIMENT split --key 5 --number-columns 15 --field-separator ',' --crlf --dir "path/to/output-dir" input-file
+$REGIMENT split -k 5 -c 15 -f ',' --crlf --dir "path/to/output-dir" input-file
+
+# all the things
+$REGIMENT split -f ',' -k 1 -k 4 -k 5 -c 26 -m 10G --crlf --standardized -d "/path/to/output-dir" input-file
+
+# merge-tmps
+
+# write to stdout
+$REGIMENT merge-tmps "/path/to/dir1" "/path/to/dir2"
+
+# specify output file
+$REGIMENT merge-tmps --output "path/to/output-file" "/path/to/dir1" "/path/to/dir2"
+$REGIMENT merge-tmps -o "path/to/output-file" "/path/to/dir1" "/path/to/dir2"
+
+