Skip to content

Commit

Permalink
fix: edlib now doesn't freeze when size of alphabet is 256.
Browse files Browse the repository at this point in the history
  • Loading branch information
Martinsos committed Aug 20, 2021
1 parent 0d2f7f7 commit ec2310e
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 15 deletions.
2 changes: 1 addition & 1 deletion bindings/python/Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
default: build

.PHONY:
edlib:
edlib: $(shell find ../../edlib)
# create a clean (maybe updated) copy of edlib src
rm -rf edlib && cp -r ../../edlib .

Expand Down
23 changes: 10 additions & 13 deletions bindings/python/edlib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -32,30 +32,27 @@ def _map_to_bytes(query, target, additional_equalities):
query_bytes = _map_ascii_string(query)
target_bytes = _map_ascii_string(target)
except NeedsAlphabetMapping:
# Map non-ascii symbols into an ASCII alphabet so it can be used
# in the C++ code
query_vals = set(query)
target_vals = set(target)
input_mapping = {
c: chr(idx)
for idx, c in enumerate(query_vals.union(target_vals))
}
if len(input_mapping) > 256:
# Map elements of alphabet to chars from 0 up to 255, so that Edlib can work with them,
# since C++ Edlib needs chars.
alphabet = set(query).union(set(target))
if len(alphabet) > 256:
raise ValueError(
"query and target combined have more than 256 unique values, "
"this is not supported.")
map_seq = lambda seq: ''.join(input_mapping[x] for x in seq).encode('ascii')
alphabet_to_byte_mapping = {
c: idx.to_bytes(1, byteorder='big') for idx, c in enumerate(alphabet)
}
map_seq = lambda seq: b''.join(alphabet_to_byte_mapping[c] for c in seq)
query_bytes = map_seq(query)
target_bytes = map_seq(target)
if additional_equalities is not None:
additional_equalities = [
(input_mapping[a], input_mapping[b])
(alphabet_to_byte_mapping[a].decode('utf-8'), alphabet_to_byte_mapping[b].decode('utf-8'))
for a, b in additional_equalities
if a in input_mapping and b in input_mapping]
if a in alphabet_to_byte_mapping and b in alphabet_to_byte_mapping]
return query_bytes, target_bytes, additional_equalities



def align(query, target, mode="NW", task="distance", k=-1, additionalEqualities=None):
""" Align query with target using edit distance.
@param {str or bytes or iterable of hashable objects} query, combined with target must have no more
Expand Down
11 changes: 11 additions & 0 deletions bindings/python/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,17 @@
result = edlib.align("telephone", "", mode="SHW")
testFailed = testFailed or (not (result and result["editDistance"] == 9))

# Unicode characters
result = edlib.align("ты милая", "ты гений")
testFailed = testFailed or (not (result and result["editDistance"] == 5 and result["alphabetLength"] == 12))

# Long alphabet.
long_alphabet = ''.join([chr(idx) for idx in range(1, 257)])
long_seq1 = long_alphabet * 3
long_seq2 = long_alphabet + long_alphabet[::-1] + long_alphabet
result = edlib.align(long_seq1, long_seq2)
testFailed = testFailed or (not (result and result["editDistance"] == 256))

if testFailed:
print("Some of the tests failed!")
else:
Expand Down
2 changes: 1 addition & 1 deletion edlib/src/edlib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,7 @@ static inline Word* buildPeq(const int alphabetLength,
Word* Peq = new Word[(alphabetLength + 1) * maxNumBlocks];

// Build Peq (1 is match, 0 is mismatch). NOTE: the last column is the wildcard (a symbol that matches anything), filled with all 1s.
for (unsigned char symbol = 0; symbol <= alphabetLength; symbol++) {
for (int symbol = 0; symbol <= alphabetLength; symbol++) {
for (int b = 0; b < maxNumBlocks; b++) {
if (symbol < alphabetLength) {
Peq[symbol * maxNumBlocks + b] = 0;
Expand Down

0 comments on commit ec2310e

Please sign in to comment.