Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ProcessOutput #233

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
238 changes: 203 additions & 35 deletions pyext/src/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -692,59 +692,148 @@ def write_stats2(self):
for stat in self.dictionary_stats2.keys():
self.write_stat2(stat)


class ProcessOutput(object):
"""A class for reading stat files"""
def __init__(self, filename):
self.filename = filename
self.isstat1 = False
self.isstat2 = False
self.isstat1=False
self.isstat2=False

# open the file
if not self.filename is None:
f = open(self.filename, "r")
else:
raise ValueError("No file name provided. Use -h for help")

# get the keys from the first line
for line in f.readlines():
d = ast.literal_eval(line)
self.klist = list(d.keys())
# check if it is a stat2 file
if "STAT2HEADER" in self.klist:
self.isstat2 = True
for k in self.klist:
if "STAT2HEADER" in str(k):
# if print_header: print k, d[k]
del d[k]
stat2_dict = d
# get the list of keys sorted by value
kkeys = [k[0]
for k in sorted(stat2_dict.items(), key=operator.itemgetter(1))]
self.klist = [k[1]
for k in sorted(stat2_dict.items(), key=operator.itemgetter(1))]
self.invstat2_dict = {}
for k in kkeys:
self.invstat2_dict.update({stat2_dict[k]: k})
else:
IMP.handle_use_deprecated("statfile v1 is deprecated. "
"Please convert to statfile v2.\n")
self.isstat1 = True
self.klist.sort()
self.exp_dict={} # Store all experimental header details here.

# get the {CategoryID:Category} pairs from the first line
line = f.readline()

#Store these keys in a dictionary. Example pair: {109 :'Total_Score'}
self.dict = self.parse_line(line, header=True)
#self.dict = ast.literal_eval(line)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove, not comment out, old stuff. If we want to go back, that's what git is for.


self.klist = list(self.dict.keys())

if "STAT2HEADER" in self.klist:
self.isstat2 = True
for k in self.klist:
if "STAT2HEADER" in str(k):
# if print_header: print k, d[k]
self.exp_dict[k]=self.dict[k]
del self.dict[k]

# get the list of keys sorted by value
kkeys = [k[0]
for k in sorted(self.dict.items(), key=operator.itemgetter(1))]
self.klist = [k[1]
for k in sorted(self.dict.items(), key=operator.itemgetter(1))]
self.inv_dict = {}

for k in kkeys:
self.inv_dict.update({self.dict[k]: k})
else:
print("WARNING: statfile v1 is deprecated. Please convert to statfile v2")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nothing wrong with your patch here, but when you update it, you should use the "proper" deprecation function, as in current develop.

self.isstat1 = True
self.klist.sort()
# For v1, no need to map from ids to field names, so just make
# a dumb one-to-one mapping so as not to confuse v2 code
self.dict = {}
self.inv_dict = {}
for k in self.klist:
self.dict[k] = k
self.inv_dict[k] = k


break
f.close()


def parse_line(self, line, header=False):
    """Parse one stat-file line into a dictionary of key:value pairs.

    The line is expected to look like
    {key:value, key:value, key:"{v1, v2, v3}", key:value}.
    Double-quoted values are stored verbatim (without the double quotes);
    every other value has its single quotes removed and is stripped of
    surrounding whitespace, so all values are returned as strings.
    Keys that can be converted with int() are stored as integers; other
    keys are stored as strings with single quotes removed.

    Fields are separated on commas, so a value that is not double-quoted
    must not itself contain a comma.

    @param line    one line of the stat file, including the outer braces
    @param header  if True, a 'STAT2HEADER_ENVIRON' entry (assumed to be
                   the last keyword of the line) is stored as the raw
                   remainder of the line instead of being parsed
    @return dictionary of the parsed key:value pairs
    """
    d = {}  # output dictionary

    def parse_key(raw):
        # Integer keys stay integers; anything else becomes a string
        # with its single quotes removed.
        try:
            return int(raw.replace(",", "").replace(":", "").strip())
        except ValueError:
            return raw.replace("'", "").strip()

    def add_pairs(fields, separator):
        # Store every non-blank "key<separator>value" field in d.
        for field in fields:
            if not field.strip():
                # Splitting on commas occasionally leaves an empty field.
                continue
            kv = field.split(separator)
            # The value is encased in single quotes, so remove these.
            d[parse_key(kv[0])] = kv[1].replace("'", "").strip()

    if header and "STAT2HEADER_ENVIRON" in line:
        # Assume that STAT2HEADER_ENVIRON is the last keyword
        d['STAT2HEADER_ENVIRON'] = line.split(", 'STAT2HEADER_ENVIRON':")[1]
        # Put the closing brace back so that the brace stripping below
        # does not eat the last character of the preceding value.
        line = line.split(", 'STAT2HEADER_ENVIRON'")[0] + "}"

    # Remove the outer braces and split on double quotes to isolate
    # multi-component values: even segments hold "key:value, key:value"
    # text, odd segments hold the double-quoted values themselves.
    segments = line.strip()[1:-1].split("\"")

    if len(segments) == 1:
        # No double-quoted value anywhere on the line.
        add_pairs(segments[0].split(","), ":")
        return d

    last = len(segments) - 1
    for i in range(0, len(segments), 2):
        fields = segments[i].split(",")
        if i < last:
            # The final field of this segment is the key belonging to the
            # double-quoted value held in the next segment.
            add_pairs(fields[:-1], ": ")
            d[parse_key(fields[-1].split(":")[0])] = segments[i + 1]
        else:
            add_pairs(fields, ": ")

    return d


def get_keys(self):
    """Return the values of self.dict in sorted order.

    The sorted list is also stored in self.klist before being returned.
    """
    # Sorting the values directly gives the same list as sorting the
    # (key, value) items by value and keeping only the values.
    self.klist = sorted(self.dict.values())
    return self.klist

def show_keys(self, ncolumns=2, truncate=65):
    """Hand the list from get_keys() to IMP.pmi.tools.print_multicolumn.

    @param ncolumns  forwarded unchanged to print_multicolumn
                     (presumably the number of columns; confirm there)
    @param truncate  forwarded unchanged to print_multicolumn
                     (presumably a maximum entry width; confirm there)
    """
    keys = self.get_keys()
    IMP.pmi.tools.print_multicolumn(keys, ncolumns, truncate)

def get_experimental_values(self):
    """Return self.exp_dict, the dictionary of header entries.

    The dictionary itself is returned, not a copy.
    """
    header_values = self.exp_dict
    return header_values
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should add a testcase to make sure this is returning non-garbage.


def get_fields(self, fields, filtertuple=None, filterout=None, get_every=1):
'''
Get the desired field names, and return a dictionary.

@param fields desired field names
@param filterout specify if you want to "grep" out something from
the file, so that it is faster
Expand Down Expand Up @@ -773,7 +862,7 @@ def get_fields(self, fields, filtertuple=None, filterout=None, get_every=1):
#if line_number % 1000 == 0:
# print "ProcessOutput.get_fields: read line %s from file %s" % (str(line_number), self.filename)
try:
d = ast.literal_eval(line)
d = eval(line)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use ast.literal_eval rather than eval here.

except:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Never use a bare except:. Catch a specific range of exceptions, e.g. except ValueError:.

print("# Warning: skipped line number " + str(line_number) + " not a valid line")
continue
Expand Down Expand Up @@ -804,20 +893,99 @@ def get_fields(self, fields, filtertuple=None, filterout=None, get_every=1):
relationship = filtertuple[1]
value = filtertuple[2]
if relationship == "<":
if float(d[self.invstat2_dict[keytobefiltered]]) >= value:
if float(d[self.inv_dict[keytobefiltered]]) >= value:
continue
if relationship == ">":
if float(d[self.invstat2_dict[keytobefiltered]]) <= value:
if float(d[self.inv_dict[keytobefiltered]]) <= value:
continue
if relationship == "==":
if float(d[self.invstat2_dict[keytobefiltered]]) != value:
if float(d[self.inv_dict[keytobefiltered]]) != value:
continue

[outdict[field].append(d[self.invstat2_dict[field]])
[outdict[field].append(d[self.inv_dict[field]])
for field in fields]
f.close()
return outdict

def return_models_satisfying_criteria(self, criteria):
    """Return the lines of the stat file that satisfy all criteria.

    Every line of self.filename (after the first line, if the file is a
    statfile v2) is parsed with parse_line() and kept only if
    does_line_pass_criteria() is true for every criterion.

    @param criteria  list of criterion tuples; each one is passed
                     unchanged to does_line_pass_criteria(), which reads
                     c[0] as the key name, c[1] as the value and c[2] as
                     the comparison string. An empty list keeps every
                     line.
    @return list of dictionaries, one per line that passed, in file order
    """
    output_list = []

    # The with-block guarantees the file is closed even if parsing or a
    # criterion check raises.
    with open(self.filename, "r") as f:
        # skip the first line for a statfile v2
        if self.isstat2:
            f.readline()

        for line in f:
            fields = self.parse_line(line)
            # all() stops at the first failing criterion.
            if all(self.does_line_pass_criteria(fields, c)
                   for c in criteria):
                output_list.append(fields)

    return output_list

def _float_string(self, c):
# Returns a float if the string can be cast as such.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

docstring

# otherwise, just return the string
try:
float(c)
except:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no bare except:

return str(c)

return float(c)


def does_line_pass_criteria(self, fields, c):
    """Decide whether one parsed stat-file line satisfies one criterion.

    Both the criterion value and the value found in the line are passed
    through _float_string(), so they are compared as floats when both
    are numeric and as strings when both are not.

    NOTE(review): the tuple is read here as (key, value, comparison),
    whereas get_fields() reads its filtertuple as
    (key, relationship, value); confirm the differing order is intended.

    @param fields  dictionary for one stat-file line, indexed by the
                   values stored in self.inv_dict
    @param c       criterion tuple: c[0] is the key name, c[1] the value
                   to compare against and c[2] the comparison string
                   ("==", "<" or ">")
    @return True if "line value <comparison> criterion value" holds,
            False otherwise
    @exception KeyError    if the key name is not in get_keys()
    @exception ValueError  if the comparison string is not recognised
    @exception TypeError   if one of the two values is numeric and the
                           other is not
    """
    key = c[0]
    if key not in self.get_keys():
        raise KeyError('ERROR: IMP.pmi.output.ProcessOutput.does_line_pass_criteria() - Key %s is not in this stat file' % (key))

    # Cast the value string as a float or, if not possible, keep it as a string
    value = self._float_string(c[1])

    comparison = c[2]
    if comparison not in ("==", "<", ">"):
        raise ValueError('ERROR: IMP.pmi.output.ProcessOutput.does_line_pass_criteria() - Comparison string must be \'>\', \'<\' or \'==\', instead of %s' % (comparison))

    # Map the field name to the key used in the parsed line.
    intkey = self.inv_dict[key]
    model_value = self._float_string(fields[intkey])

    if type(value) is not type(model_value):
        raise TypeError('ERROR: IMP.pmi.output.ProcessOutput.does_line_pass_criteria() - Comparison field %s is of type %s while you tried to compare it to a %s' % (key, type(model_value), type(value)))

    if comparison == "==":
        return model_value == value
    if comparison == ">":
        return model_value > value
    return model_value < value




class CrossLinkIdentifierDatabase(object):
Expand Down