-
Notifications
You must be signed in to change notification settings - Fork 0
/
common.py
99 lines (83 loc) · 3.17 KB
/
common.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# common.py
# common variables and functions
#
# Jiyong Jang, 2012
#
import os
import re
from collections import namedtuple
# global variables
# NOTE(review): the meanings given below are inferred from the names only;
# confirm against the modules that read these settings.

# runtime state; both start unset
verbose_mode = False    # checked by verbose_print() before printing
magic_cookie = None     # object queried by file_type(); None until assigned

# matching parameters
ngram_size = 4          # presumably the n-gram length -- TODO confirm
context_line = 10       # presumably a count of context lines -- TODO confirm

# bloom filter parameters
bloomfilter_size = 1 << 21  # 2097152
min_mn_ratio = 32           # presumably a minimum m/n ratio -- TODO confirm
# Lightweight immutable record types (plain namedtuples).

# One patch file: path, extension, original and normalized lines, hashes.
PatchInfo = namedtuple(
    'PatchInfo',
    ['file_path', 'file_ext', 'orig_lines', 'norm_lines', 'hash_list'],
)

# One source file: same fields as PatchInfo minus the hash list.
SourceInfo = namedtuple(
    'SourceInfo',
    ['file_path', 'file_ext', 'orig_lines', 'norm_lines'],
)

# A line range within a source (identified by source_id) plus the
# neighbouring context positions on either side.
ContextInfo = namedtuple(
    'ContextInfo',
    ['source_id', 'prev_context_line', 'start_line', 'end_line',
     'next_context_line'],
)
class FileExt:
    '''
    Integer codes identifying the kind of a file.

    Plain class attributes (not an Enum), so the values compare equal to
    ordinary ints.
    '''
    # generic categories
    NonText = 0
    Text = 1

    # specific languages
    C = 2
    Java = 3
    ShellScript = 4
    Python = 5
    Perl = 6
    PHP = 7
    Ruby = 8
# html escape characters
# Maps each character that is special in HTML to its entity reference.
# The values must be the entity strings, not the characters themselves,
# otherwise the table is an identity mapping and escapes nothing.
html_escape_dict = {
    '&': '&amp;',
    '>': '&gt;',
    '<': '&lt;',
    '"': '&quot;',
    '\'': '&apos;',
}
# regex for comments
# Every pattern below is an alternation of named groups:
#   comment          - a comment running to the end of a line
#   multilinecomment - a complete block comment (only in some patterns)
#   noncomment       - a single- or double-quoted string literal (backslash
#                      escapes allowed), or one character followed by a run
#                      of characters that cannot start another alternative
# NOTE: the C, Perl and PHP patterns also put runs of '{' / '}' in the
# 'comment' group.
# C: '//' to end of line, brace runs, and '/* ... */' blocks.
c_regex = re.compile(r'(?P<comment>//.*?$|[{}]+)|(?P<multilinecomment>/\*.*?\*/)|(?P<noncomment>\'(\\.|[^\\\'])*\'|"(\\.|[^\\"])*"|.[^/\'"{}]*)', re.DOTALL | re.MULTILINE)
# C, unterminated block comment: '/*' through the end of the input, or the
# start of the input through '*/' (no MULTILINE flag, so '^' and '$' anchor
# to the whole string). Presumably applied to fragments that begin or end
# inside a block comment -- TODO confirm against callers.
c_partial_comment_regex = re.compile(r'(?P<comment>/\*.*?$|^.*?\*/)|(?P<noncomment>\'(\\.|[^\\\'])*\'|"(\\.|[^\\"])*"|.[^/\'"{}]*)', re.DOTALL)
# Shell script: '#' to end of line.
shellscript_regex = re.compile(r'(?P<comment>#.*?$)|(?P<noncomment>\'(\\.|[^\\\'])*\'|"(\\.|[^\\"])*"|.[^#\'"]*)', re.DOTALL | re.MULTILINE)
# Perl: '#' to end of line, and brace runs.
perl_regex = re.compile(r'(?P<comment>#.*?$|[{}]+)|(?P<noncomment>\'(\\.|[^\\\'])*\'|"(\\.|[^\\"])*"|.[^#\'"{}]*)', re.DOTALL | re.MULTILINE)
# PHP: '#' or '//' to end of line, brace runs, and '/* ... */' blocks.
php_regex = re.compile(r'(?P<comment>#.*?$|//.*?$|[{}]+)|(?P<multilinecomment>/\*.*?\*/)|(?P<noncomment>\'(\\.|[^\\\'])*\'|"(\\.|[^\\"])*"|.[^#/\'"{}]*)', re.DOTALL | re.MULTILINE)
# Ruby: '#' to end of line, and '=begin ... =end' blocks.
ruby_regex = re.compile(r'(?P<comment>#.*?$)|(?P<multilinecomment>=begin.*?=end)|(?P<noncomment>\'(\\.|[^\\\'])*\'|"(\\.|[^\\"])*"|.[^#=\'"]*)', re.DOTALL | re.MULTILINE)
# Ruby, unterminated block comment: '=begin' through the end of the input,
# or the start of the input through '=end' (no MULTILINE flag).
ruby_partial_comment_regex = re.compile(r'(?P<comment>=begin.*?$|^.*?=end)|(?P<noncomment>\'(\\.|[^\\\'])*\'|"(\\.|[^\\"])*"|.[^#=\'"]*)', re.DOTALL)
# regex for whitespaces except newlines
# Matches one or more of: tab, vertical tab, form feed, carriage return, space.
whitespaces_regex = re.compile(r'[\t\x0b\x0c\r ]+')
def file_type(file_path):
    '''
    Return the module-level magic_cookie's description of file_path.

    magic_cookie.from_file(file_path) is tried first; if that raises
    AttributeError, magic_cookie.file(file_path) is used instead.
    NOTE(review): presumably this supports two different libmagic
    bindings with different method names -- confirm.
    '''
    try:
        description = magic_cookie.from_file(file_path)
    except AttributeError:
        # fall back to the alternative method name
        description = magic_cookie.file(file_path)
    return description
def verbose_print(text):
    '''
    Print text when the module-level verbose_mode flag is truthy;
    otherwise do nothing.
    '''
    if not verbose_mode:
        return
    print('%s' % text)
def fnv1a_hash(string):
    '''
    FNV-1a 32bit hash (http://isthe.com/chongo/tech/comp/fnv/)

    Each character of string is XORed into the running value, which is
    then multiplied by the FNV prime and truncated to 32 bits.
    Returns the offset basis (2166136261) for an empty string.
    '''
    fnv_prime = 16777619
    digest = 2166136261  # 32-bit offset basis
    for ch in string:
        # XOR first, then multiply: this order is what makes it FNV-1a
        digest = ((digest ^ ord(ch)) * fnv_prime) & 0xFFFFFFFF
    return digest
def djb2_hash(string):
    '''
    djb2 hash (http://www.cse.yorku.ca/~oz/hash.html)

    Starts from 5381 and, for each character, multiplies the running
    value by 33 and adds the character code, truncated to 32 bits.
    Returns 5381 for an empty string.
    '''
    digest = 5381
    for ch in string:
        # digest * 33 is the same as (digest << 5) + digest
        digest = (digest * 33 + ord(ch)) & 0xFFFFFFFF
    return digest
def sdbm_hash(string):
    '''
    sdbm hash (http://www.cse.yorku.ca/~oz/hash.html)

    Starts from 0 and, for each character, multiplies the running value
    by 65599 and adds the character code, truncated to 32 bits.
    Returns 0 for an empty string.
    '''
    digest = 0
    for ch in string:
        # digest * 65599 is the same as (digest << 6) + (digest << 16) - digest
        digest = (ord(ch) + digest * 65599) & 0xFFFFFFFF
    return digest
'''
http://programmers.stackexchange.com/questions/49550/which-hashing-algorithm-is-best-for-uniqueness-and-speed
http://www.partow.net/programming/hashfunctions/index.html
'''