forked from MarcoSero/Norvigs-Spelling-Corrector
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Spelling.hs
66 lines (53 loc) · 2.27 KB
/
Spelling.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
module Spelling (TrainingDict, nWords, correct) where
import qualified Data.ByteString.Char8 as B
import Data.Char (isAlpha, toLower)
import Data.List (foldl', sortBy)
import qualified Data.Map.Strict as M
import Data.Ord (comparing)
import qualified Data.Set as S
import Paths_Norvigs_Spelling_Corrector (getDataFileName)
type WordSet = S.Set String
type TrainingDict = M.Map String Int
alphabet :: String
alphabet = ['a' .. 'z']
nWords :: IO TrainingDict
nWords = do
ws <- getDataFileName "big.txt" >>= B.readFile
return (train . lowerWords . B.unpack $ ws)
lowerWords :: String -> [String]
lowerWords = words . map normalize
where normalize c = if isAlpha c then toLower c else ' '
train :: [String] -> TrainingDict
train = foldl' (\acc x -> M.insertWith (+) x 1 acc) M.empty
edits1 :: String -> WordSet
edits1 w = S.fromList $ deletes ++ transposes ++ replaces ++ inserts
where
splits = [ splitAt n w | n <- [0 .. length w - 1] ]
deletes = map (\(a, b) -> a ++ tail b) splits
transposes = [ a ++ [b1, b0] ++ bs
| (a, b0:b1:bs) <- splits ]
replaces = [ as ++ [c] ++ bs
| (as, _:bs) <- splits, c <- alphabet]
inserts = [ a ++ [c] ++ b
| (a,b) <- splits, c <- alphabet]
edits2 :: String -> WordSet
edits2 = S.unions . S.toList . S.map edits1 . edits1
knownEdits2 :: String -> TrainingDict -> WordSet
knownEdits2 w nwords = edits2 w `S.intersection` M.keysSet nwords
known :: WordSet -> TrainingDict -> WordSet
known inputSet nwords = inputSet `S.intersection` M.keysSet nwords
choices :: String -> TrainingDict -> WordSet
choices w ws = foldr orNextIfEmpty (S.singleton w)
[ known (S.singleton w) ws
, known (edits1 w) ws
, knownEdits2 w ws
]
where orNextIfEmpty x y = if S.null x then y else x
chooseBest :: WordSet -> TrainingDict -> String
chooseBest ch ws = chooseBest' $
ws `M.intersection` M.fromList (map (\x -> (x, ())) (S.toList ch))
where
chooseBest' bestChs = head (map fst (sortCandidates bestChs))
sortCandidates = sortBy (comparing snd) . M.toList
correct :: TrainingDict -> String -> String
correct ws w = chooseBest (choices w ws) ws