From 731edc94955c5a583b96c8373350998a921f3644 Mon Sep 17 00:00:00 2001 From: Navin Keswani Date: Thu, 23 Mar 2017 21:28:27 +1100 Subject: [PATCH] Performance notes and minor fixes. --- README.md | 2 ++ doc/performance-notes.md | 30 ++++++++++++++++++++++++++++++ main/regiment.hs | 6 +++--- src/Regiment/Parse.hs | 2 +- 4 files changed, 36 insertions(+), 4 deletions(-) create mode 100644 doc/performance-notes.md diff --git a/README.md b/README.md index 6d906f7..3dd0ac8 100644 --- a/README.md +++ b/README.md @@ -47,3 +47,5 @@ regiment sort -k 5 -c 15 -f ',' -o "path/to/output-file" input-file # all the things regiment sort -f ',' -k 1 -k 4 -k 5 -c 26 -m 10G --crlf --standardized -o "path/to/output-file" input-file ``` + +Note: `regiment` requires local storage roughly equivalent to the size of the inputs, and follows Unix `TMPDIR` conventions for that storage. diff --git a/doc/performance-notes.md b/doc/performance-notes.md new file mode 100644 index 0000000..0a3d7dd --- /dev/null +++ b/doc/performance-notes.md @@ -0,0 +1,30 @@ +### Notes on performance + +20170323 - with no performance tuning, at its inception (around commit `b6da9b7`): + +``` +Sorting an 11GB file (on a Macbook Pro): + +gnu-sort (defaults): LC_COLLATE=C sort -t '|' -k 3,3 -o ~/Downloads/grohl/sort-sauerkraut 314.75s user 67.72s system 96% cpu 6:36.24 total + +gnu-sort (2GB memory allocation): LC_COLLATE=C sort -t '|' -k 3,3 -S 2G -o ~/Downloads/grohl/sort-sauerkraut 346.95s user 34.03s system 97% cpu 6:32.65 total + +regiment (2GB memory allocation): ./dist/build/Regiment/regiment sort -c 4 -k 3 -f '|' -m 2147483648 -o 3283.97s user 481.99s system 95% cpu 1:05:54.34 total +``` + +Results of profiling point clearly to the need to improve `updateMinCursor`: + +``` +COST CENTRE MODULE %time %alloc + +updateMinCursor Regiment.Vanguard.Base 69.3 80.1 +runVanguard Regiment.Vanguard.Base 7.5 9.8 +compare Regiment.Data 6.6 0.0 +flushVector Regiment.Parse 3.3 1.4 +compare Regiment.Data 2.8 0.0 
+readKeyedPayloadIO Regiment.Vanguard.IO 1.8 1.2 +writeCursor Regiment.Parse 1.2 1.4 +selectSortKeys Regiment.Parse 1.0 1.1 +``` + + diff --git a/main/regiment.hs b/main/regiment.hs index b6dd674..d40af8d 100644 --- a/main/regiment.hs +++ b/main/regiment.hs @@ -129,21 +129,21 @@ lfP :: Parser Newline lfP = flag' LF . mconcat $ [ long "lf" - , help "The input file uses \n to terminate lines (default)." + , help "The input file uses \\n to terminate lines (default)." ] crP :: Parser Newline crP = flag' CR . mconcat $ [ long "cr" - , help "The input file uses \r to terminate lines." + , help "The input file uses \\r to terminate lines." ] crlfP :: Parser Newline crlfP = flag' CRLF . mconcat $ [ long "crlf" - , help "The input file uses \r\n to terminate lines." + , help "The input file uses \\r\\n to terminate lines." ] toChar :: Text -> Maybe Word8 diff --git a/src/Regiment/Parse.hs b/src/Regiment/Parse.hs index 0e504a0..a60efaa 100644 --- a/src/Regiment/Parse.hs +++ b/src/Regiment/Parse.hs @@ -125,7 +125,7 @@ flushVector :: Grow.Grow Boxed.MVector (PrimState IO) (Boxed.Vector BS.ByteStrin flushVector acc counter (TempDirectory tmp) = do mv <- Grow.unsafeElems acc Tim.sort mv - (v :: Boxed.Vector (Boxed.Vector BS.ByteString)) <- Grow.unsafeFreeze acc + (v :: Boxed.Vector (Boxed.Vector BS.ByteString)) <- Grow.freeze acc -- write to TempFile newEitherT . IO.withFile (tmp (T.unpack $ renderIntegral counter)) WriteMode $ \out -> do runEitherT $ writeChunk out v