From 9fa8b76e5bc72404efa4d0d074f4e62155575aae Mon Sep 17 00:00:00 2001 From: Sharif Olorin Date: Mon, 18 Jul 2016 08:08:23 +0000 Subject: [PATCH] Test and benchmark for date field check --- ambiata-warden.cabal | 8 +++-- bench/bench.hs | 17 ++++++++- cbits/field.c | 6 ++-- test/Test/Warden/Arbitrary.hs | 60 +++++++++++++++++++++++++++++++- test/Test/Warden/Parser/Field.hs | 10 ++++++ 5 files changed, 95 insertions(+), 6 deletions(-) diff --git a/ambiata-warden.cabal b/ambiata-warden.cabal index bace835..2f9b14b 100644 --- a/ambiata-warden.cabal +++ b/ambiata-warden.cabal @@ -186,8 +186,9 @@ executable warden-gen , resourcet == 1.1.* , semigroups , temporary == 1.2.* - , transformers >= 0.3 && < 5 , text + , time == 1.5.* + , transformers >= 0.3 && < 5 , unix >= 2.7.1 && < 2.7.3 , vector == 0.10.* @@ -223,10 +224,11 @@ test-suite test , filepath == 1.3.* , ieee754 == 0.7.* , lens == 4.9.* - , semigroups , quickcheck-instances == 0.3.* + , semigroups , temporary == 1.2.* , text + , time == 1.5.* , vector == 0.10.* test-suite test-io @@ -273,6 +275,7 @@ test-suite test-io , semigroups , temporary , text == 1.2.* + , time == 1.5.* , transformers >= 0.3 && < 5 , unix >= 2.7.1 && < 2.7.3 , vector == 0.10.* @@ -319,6 +322,7 @@ benchmark bench , semigroups , temporary , text == 1.2.* + , time == 1.5.* , transformers >= 0.3 && < 5 , unix >= 2.7.1 && < 2.7.3 , vector == 0.10.* diff --git a/bench/bench.hs b/bench/bench.hs index 1946b3e..daf6302 100644 --- a/bench/bench.hs +++ b/bench/bench.hs @@ -105,6 +105,12 @@ prepareBools = fmap (fmap T.encodeUtf8) . generate' (Deterministic 555) (GenSize prepareNonBools :: IO [ByteString] prepareNonBools = fmap (fmap T.encodeUtf8) . generate' (Deterministic 666) (GenSize 100) $ vectorOf 100 renderedNonBool +prepareDates :: IO [ByteString] +prepareDates = generate' (Deterministic 555) (GenSize 100) $ vectorOf 100 renderedDate + +prepareNonDates :: IO [ByteString] +prepareNonDates = generate' (Deterministic 666) (GenSize 100) $ vectorOf 100 renderedNonDate + benchABDecode :: FileFormat -> NonEmpty ViewFile -> IO () benchABDecode ff vfs = let sep = Separator . fromIntegral $ ord '|' @@ -150,6 +156,9 @@ benchToRow = toRow . Right benchCheckFieldBool :: [ByteString] -> [Bool] benchCheckFieldBool = fmap checkFieldBool +benchCheckFieldDate :: [ByteString] -> [Bool] +benchCheckFieldDate = fmap checkFieldDate + main :: IO () main = do withTempDirectory "." "warden-bench-" $ \root -> @@ -160,11 +169,17 @@ main = do , bench "decode/delimited-text/1000" $ nfIO (benchABDecode DelimitedText vfs) , bench "decode/toRow/100" $ nf benchToRow bss ] - , env ((,,) <$> prepareRow <*> prepareBools <*> prepareNonBools) $ \ ~(rs, bools, nonbools) -> + , env ((,,,,) <$> prepareRow + <*> prepareBools + <*> prepareNonBools + <*> prepareDates + <*> prepareNonDates) $ \ ~(rs, bools, nonbools, dates, nondates) -> bgroup "field-parsing" $ [ bench "parseField/200" $ nf benchFieldParse rs , bench "checkFieldBool/boolean/100" $ nf benchCheckFieldBool bools , bench "checkFieldBool/non-boolean/100" $ nf benchCheckFieldBool nonbools + , bench "checkFieldDate/date/100" $ nf benchCheckFieldDate dates + , bench "checkFieldDate/non-date/100" $ nf benchCheckFieldDate nondates ] , env prepareFolds $ \ ~(rs, ts, piis, nonPiis, bs100, bs10) -> bgroup "folds" $ [ diff --git a/cbits/field.c b/cbits/field.c index f0a6c59..b98a5b1 100644 --- a/cbits/field.c +++ b/cbits/field.c @@ -127,7 +127,7 @@ static inline bool is_separator(char c) { precondition: n >= 8 */ static inline bool match_ymd(char *buf, size_t n) { - /* 0xc0 = 0x80 & 0x40 - if these bits are set, the byte is too + /* 0xc0 = 0x80 | 0x40 - if these bits are set, the byte is too high to be a digit or a separator. */ static const int64_t ymd_mask = 0xc0c0c0c0c0c0ffff; @@ -138,7 +138,7 @@ static inline bool match_ymd(char *buf, size_t n) { /* First, we drop everything which doesn't start with '20' and have eight bytes compatible with a YYYYxMMxDD format. */ - if (!((*p & ymd_mask) == ymd_bits) && is_digit(buf[2]) && is_digit(buf[3])) { + if (!(((*p & ymd_mask) == ymd_bits) && is_digit(buf[2]) && is_digit(buf[3]))) { return FALSE; } @@ -166,6 +166,8 @@ static inline bool match_ymd(char *buf, size_t n) { Currently checks: - Fields beginning with big-endian dates. + + FIXME: more supported date formats */ bool warden_field_datetime(char *buf, size_t n) { /* The shortest thing we're willing to call a "date" at this diff --git a/test/Test/Warden/Arbitrary.hs b/test/Test/Warden/Arbitrary.hs index 48738c9..432a265 100644 --- a/test/Test/Warden/Arbitrary.hs +++ b/test/Test/Warden/Arbitrary.hs @@ -11,12 +11,16 @@ import qualified Data.ByteString.Lazy as BL import Data.Char import Data.Csv import qualified Data.Set as S -import Data.List (nub) +import Data.List ((\\), nub) import qualified Data.List as L import Data.List.NonEmpty (NonEmpty) import qualified Data.List.NonEmpty as NE +import Data.String (String) import qualified Data.Text as T import Data.Text.Encoding (decodeUtf8, decodeUtf8', encodeUtf8) +import Data.Time.Calendar (Day(..)) +import Data.Time.Clock (UTCTime(..), secondsToDiffTime) +import Data.Time.Format (defaultTimeLocale, formatTime) import qualified Data.Vector as V import qualified Data.Vector.Unboxed as VU import Data.Word @@ -36,6 +40,7 @@ import Test.Delorean.Arbitrary () import Test.QuickCheck (Small(..), NonNegative(..)) import Test.QuickCheck (Arbitrary, Gen, elements, choose, listOf, listOf1) import Test.QuickCheck (vectorOf, arbitrary, suchThat, oneof, sized) +import Test.QuickCheck (frequency) import Test.QuickCheck.Instances () import Text.Printf (printf) @@ -792,3 +797,56 @@ instance Arbitrary PIICheckType where pure NoPIIChecks , PIIChecks <$> arbitrary ] + +genSensibleDateTime :: Gen UTCTime +genSensibleDateTime = do + -- sometime this century + days <- ModifiedJulianDay <$> choose (142 * 365, 192 * 365) + secs <- secondsToDiffTime <$> choose (0, 86401) + pure $ UTCTime days secs + +renderedDate :: Gen BS.ByteString +renderedDate = do + dt <- genSensibleDateTime + render <- frequency [(1, renderDateNoSeparator), (9, renderDateSeparator)] + pure . BSC.pack $ render dt + +renderDateNoSeparator :: Gen (UTCTime -> String) +renderDateNoSeparator = do + tp <- timePart + pure (formatTime defaultTimeLocale ("%Y%m%d" <> tp)) + +renderDateSeparator :: Gen (UTCTime -> String) +renderDateSeparator = do + tp <- timePart + sep <- elements ["-", ".", "/"] + pure (formatTime defaultTimeLocale ((concat ["%Y", sep, "%m", sep, "%d"]) <> tp)) + +timePart :: Gen String +timePart = elements [ + "" + , "%H%M" + , "%H:%M" + , "%H%M%S" + , "%H:%M:%S" + , "T%H:%M:%SZ" + ] + + +renderedNonDate :: Gen BS.ByteString +renderedNonDate = do + x <- fmap BS.pack $ listOf1 noDigits + dt <- renderedDate + oneof [ + pure $ x <> dt + , reversed + , encodeUtf8 <$> elements muppets + ] + where + reversed = do + dt <- genSensibleDateTime + render <- renderDateSeparator + pure . BSC.pack . reverse $ render dt + + noDigits = + elements $ [0x00..0xff] \\ [0x30..0x39] diff --git a/test/Test/Warden/Parser/Field.hs b/test/Test/Warden/Parser/Field.hs index 92560b4..909dc04 100644 --- a/test/Test/Warden/Parser/Field.hs +++ b/test/Test/Warden/Parser/Field.hs @@ -78,6 +78,16 @@ prop_numericFieldP_neg = forAll (elements muppets) $ \t -> let r = parseOnly numericFieldP $ T.encodeUtf8 t in isLeft r === True +prop_checkFieldDate_pos :: Property +prop_checkFieldDate_pos = forAll renderedDate $ \bs -> + let r = checkFieldDate bs in + r === True + +prop_checkFieldDate_neg :: Property +prop_checkFieldDate_neg = forAll renderedNonDate $ \bs -> + let r = checkFieldDate bs in + r === False + return [] tests :: IO Bool tests = $forAllProperties $ quickCheckWithResult (stdArgs { maxSuccess = 1000 })