-
Notifications
You must be signed in to change notification settings - Fork 2
/
stat_f_v2.py
79 lines (65 loc) · 2.14 KB
/
stat_f_v2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import codecs
import os
import sys
''' Run the program from a command line using *** python stat_f_v2.py file_name
'''
def longer_than(data, limit=80):
    '''Count the lines of ``data`` that hold more than ``limit`` tokens.

    A token is any whitespace-separated chunk of a line, so the length
    being compared is a token count, not a character count.

    data  -- text to inspect; it is split into lines with ``splitlines()``.
    limit -- a line is counted only when its token count is strictly
             greater than this value.  Defaults to 80, the threshold that
             used to be hard-coded.

    Returns the number of such lines as an int (0 for empty input).
    '''
    return sum(1 for line in data.splitlines() if len(line.split()) > limit)
def count_sent(data):
    '''Return the number of lines in ``data``, ignoring one trailing empty line.

    Every line produced by ``splitlines()`` is counted; judging by the
    function name, the input is expected to hold one sentence per line.
    If the last line is empty it is left out of the count.

    Empty input yields 0 instead of raising IndexError.
    '''
    # Split once and reuse the result rather than re-splitting per access.
    lines = data.splitlines()
    # The ``lines`` guard keeps the [-1] lookup safe for empty input.
    if lines and lines[-1] == '':
        return len(lines) - 1
    return len(lines)
def count_token(data):
    '''Return how many whitespace-separated tokens ``data`` contains.'''
    tokens = data.split()
    return len(tokens)
def count_word_type(data):
    '''Return the number of distinct whitespace-separated tokens in ``data``.

    Tokens are compared exactly, so differently cased spellings count as
    different types.
    '''
    vocabulary = set()
    for token in data.split():
        vocabulary.add(token)
    return len(vocabulary)
def get_longest_sent(data):
    '''Return the token count of the longest line in ``data``.

    Length is measured in whitespace-separated tokens per line.  Input
    without any lines gives 0.
    '''
    token_counts = [len(row.split()) for row in data.splitlines()]
    # The leading 0 is the floor used when there are no lines at all.
    return max([0] + token_counts)
def get_shortest_sent(data):
    '''Return the token count of the shortest line in ``data``.

    Length is measured in whitespace-separated tokens per line; an empty
    line therefore has length 0.

    The minimum is taken over the real line lengths rather than starting
    from a fixed sentinel of 100, so the result is also correct when every
    line holds more than 100 tokens.  Input without any lines gives 0.
    '''
    lengths = [len(line.split()) for line in data.splitlines()]
    if not lengths:
        return 0
    return min(lengths)
def get_average(cont):
    '''Return the average number of tokens per line of ``cont``, rounded down.

    Every line produced by ``splitlines()`` counts towards the divisor,
    including empty ones.

    Input without any lines gives 0 instead of raising ZeroDivisionError.
    '''
    lines = cont.splitlines()
    if not lines:
        return 0
    total_tokens = sum(len(line.split()) for line in lines)
    # Floor division gives the same integer on Python 2 and Python 3,
    # with no detour through float.
    return total_tokens // len(lines)
def write_log(fname, cont):
    '''Print ``cont`` and write it to the file ``fname`` encoded as UTF-8.

    fname -- path of the file to create or overwrite.
    cont  -- text to print and to store.

    A confirmation line naming the file is printed after the write.
    Returns None.
    '''
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(cont)
    # ``with`` closes the file even when the write raises.
    with codecs.open(fname, 'w', 'utf-8') as outfile:
        outfile.write(cont)
    print(' Data written to %s ' % (fname))
if __name__ == '__main__':
    # Script entry point: read the file named on the command line, collect
    # its line/token statistics and hand the report to write_log().

    # Exit with a usage hint instead of an IndexError when no file is given.
    if len(sys.argv) < 2:
        sys.exit('Usage: python %s file_name' % sys.argv[0])
    file_name = sys.argv[1]

    # ``with`` closes the input even if reading or decoding fails.
    with codecs.open(file_name, 'r', 'utf-8') as fdata:
        cont = fdata.read()

    # One report line per statistic, in the order they are computed.
    report = ['***Processed file ' + file_name + '\n']
    sent_count = count_sent(cont)
    report.append(' *** Number of Sentence *** ' + str(sent_count) + '\n')
    token_count = count_token(cont)
    report.append(' *** Number of Token *** ' + str(token_count) + '\n')
    word_type_count = count_word_type(cont)
    report.append(' *** Number of word type ' + str(word_type_count) + '\n')
    longest = get_longest_sent(cont)
    report.append(' *** Length of longest Sentence *** ' + str(longest) + '\n')
    shortest = get_shortest_sent(cont)
    report.append(' *** Length of shortest Sentence *** ' + str(shortest) + '\n')
    average_length = get_average(cont)
    report.append(' *** Average Sentence length *** ' + str(average_length) + '\n')
    longer_sent = longer_than(cont)
    report.append(' *** Sentence longer than 80 *** ' + str(longer_sent) + '\n')
    log_data = ''.join(report)

    # Prefix only the base name so an argument such as "data/in.txt" maps
    # to "data/stat_log_in.txt" rather than the broken "stat_log_data/in.txt".
    # A bare file name still yields "stat_log_<name>" as before.
    in_dir, in_base = os.path.split(file_name)
    log_file_name = os.path.join(in_dir, 'stat_log_' + in_base)
    write_log(log_file_name, log_data)