-
Notifications
You must be signed in to change notification settings - Fork 0
/
Makefile
71 lines (61 loc) · 3.94 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# User-tunable knobs (override on the command line or via the environment):
#   POOL - how many lines from the top of the long word list are used
#          (see the `password` and ${WORD_LIST} rules).
#   PICK - the argument handed to ./generate.sh by the `password` rule.
POOL ?= 8192
PICK ?= 5

# File locations. Simply-expanded (`:=`) because none of them needs late binding.
CACHE_DIR := cache
EN_FULL := ${CACHE_DIR}/en_full.txt
WORD_LIST := word_list.txt
LONG_WORD_LIST := long_word_list.txt

# Default goal: build the short word list, then print a password.
# `all` names a command, not a file, so it is declared phony.
.PHONY: all
all: word_list password

# Keep the cache directory and the downloaded frequency list around; make must
# not treat them as disposable intermediates.
.SECONDARY: ${CACHE_DIR} ${EN_FULL}

# If a recipe exits non-zero, delete its (possibly half-written) target so it is
# not mistaken for an up-to-date file on the next run.
.DELETE_ON_ERROR:
# These targets print to stdout and never create a file of the same name.
.PHONY: password long_password flat_password

# Feed the first POOL lines of the long word list to ./generate.sh, passing PICK
# as its only argument. `$<` is the prerequisite, ${LONG_WORD_LIST}.
password: ${LONG_WORD_LIST}
	@head -n ${POOL} $< | ./generate.sh ${PICK}

# Like `password`, but PICK falls back to 10 (instead of the Makefile default)
# when the recipe's shell environment has no non-empty PICK.
# $(MAKE) rather than a literal `make` keeps -n, -j and the jobserver working.
long_password: ${LONG_WORD_LIST}
	@PICK=$${PICK:-10} $(MAKE) -s password

# Like `password`, but the output goes through an unquoted command substitution,
# so the shell collapses runs of whitespace (including newlines) to single spaces.
flat_password: ${LONG_WORD_LIST}
	@echo $$($(MAKE) -s password)
# `word_list` is a convenience alias for the real file ${WORD_LIST}; it is phony
# so that a stray file called `word_list` cannot mask it.
.PHONY: word_list
word_list: ${WORD_LIST}

# The short word list is the first POOL lines of the long word list.
# NOTE: POOL is not a prerequisite, so changing POOL alone does not rebuild this
# file; remove ${WORD_LIST} (or run `make clean`) after changing it.
${WORD_LIST}: ${LONG_WORD_LIST}
	head -n ${POOL} $< > $@
# Some gotchas for the next time I edit this... character cleanup turns out to be tricky. This step would like to be just
# stripping frequency counts from the word list, but I ended up doing a lot of cleanup, like stripping accents, and
# fixing lookalike characters.
#
# - `tr` is not UTF-8 aware. `perl` is not by default UTF-8 aware, but has flags that enable it (`-C -Mutf8`). UTF-8
# awareness is required for correct handling of greek inputs.
# - `iconv`'s `//TRANSLIT` behaves differently between linux and os x.
# - OS X produces representations of stripped accent marks. Linux does not. (`é` => `'e` vs `é` => `e`)
# - Linux turns untransliterable characters (e.g. japanese, greek, cyrillic) into question marks '?', and these are not
# dropped from output. (I have tried both `//IGNORE` and `-c`)
# - This is why the `tr -dc '[:alpha:]\n'` pipe step lives on the same line as `iconv`--taken together, the output
# afterwards is consistent between linux and os x. All the above nonsense goes away when we drop non-alphabetic
# characters. (oops, not quite all...)
# - There is other nonsense not caught above; e.g. linux `iconv` converts `μ` to `u` (but still not omicron to `o`???).
# OS X does not, so there turn out to be subtle differences, though mostly in the low-frequency side of the word list
# - Due to Makefile's continuation syntax (`\` only continues lines if it's the final character before the newline), I
# can't find a way to comment on individual lines within the pipeline. So, here are line comments
#
# cat ${EN_FULL} \
# | perl -pe 's# \d+$$##' \ strip frequency counts
# | perl -C -Mutf8 -pe 'tr#οηυτνуα#onutvya# # omicron to o, and other greeks' \ fix some lookalike characters
# | (iconv -f UTF-8 -t ASCII//TRANSLIT -c; true) | tr -dc '[:alpha:]\n' \ get rid of non-ascii
# | sed '/^$$/d' \ remove blank lines
# | perl -ne '$$H{$$_}++ or print' \ removes duplicate lines... can't use
# \ `uniq`... that only removes adjacent
# \ duplicates.
# > ${LONG_WORD_LIST}
#
# Given the amount of perl in here, it might be time to just turn this section into a perl script. My one worry would be
# finding an equivalent to `iconv`'s `ASCII//TRANSLIT` within perl's standard libraries.
# Turn the raw frequency file ($<) into the cleaned word list ($@): strip the
# trailing counts, map lookalike letters to ASCII, transliterate and drop every
# non-alphabetic character, remove blank lines, then remove duplicate lines
# while keeping first-occurrence order.
# `(iconv ...; true)` deliberately ignores iconv's exit status.
${LONG_WORD_LIST}: ${EN_FULL}
	cat $< \
		| perl -pe 's# \d+$$##' \
		| perl -C -Mutf8 -pe 'tr#οηυτνуα#onutvya# # omicron to o, and other greeks' \
		| (iconv -f UTF-8 -t ASCII//TRANSLIT -c; true) | tr -dc '[:alpha:]\n' \
		| sed '/^$$/d' \
		| perl -ne '$$H{$$_}++ or print' \
		> $@
# Download the English word-frequency list into the cache.
# - ${CACHE_DIR} is an order-only prerequisite (after `|`): it must exist, but
#   its timestamp, which changes whenever files are added to or removed from the
#   directory, must not trigger a re-download.
# - `--fail` makes curl exit non-zero on an HTTP error instead of saving the
#   server's error page as the word list.
# - The download lands in $@.tmp and is renamed only on success, so an
#   interrupted or failed transfer never leaves a truncated $@ behind.
${EN_FULL}: | ${CACHE_DIR}
	curl --fail https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2016/en/en_full.txt > $@.tmp
	mv -f $@.tmp $@

# Create the cache directory (no-op if it already exists).
${CACHE_DIR}:
	mkdir -p $@
# Remove everything this Makefile generates: the download cache and both word
# lists. Declared phony so a file named `clean` cannot disable the target.
.PHONY: clean
clean:
	rm -rf ${CACHE_DIR} ${WORD_LIST} ${LONG_WORD_LIST}