Skip to content

Commit

Permalink
implement changes from most recent reviews
Browse files Browse the repository at this point in the history
  • Loading branch information
mobiusklein committed Aug 24, 2024
1 parent df56a98 commit 38026db
Show file tree
Hide file tree
Showing 5 changed files with 186 additions and 110 deletions.
144 changes: 135 additions & 9 deletions implementations/python/mzpaf/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,18 @@
and serializing.
"""
import re
from enum import Enum, Flag, auto as enauto
from dataclasses import dataclass, field
from sys import intern
from typing import Any, List, Optional, Pattern, Dict, Tuple, Union
from typing import Any, List, Optional, Pattern, Dict, Tuple, Union, NamedTuple
import warnings

try:
from pyteomics.mass import Composition
except ImportError:
Composition = None
try:
from pyteomics.proforma import (ProForma, FormulaModification)
from pyteomics.proforma import (ProForma, FormulaModification, UnimodModification)
except ImportError:
ProForma = None
FormulaModification = None
Expand Down Expand Up @@ -61,10 +63,19 @@
re.X,
)

# At the time of first writing, this pattern could be translated into the equivalent
# ECMAScript compliant regex:
# ^(?<is_auxiliary>&)?(?:(?<analyte_reference>\d+)@)?(?:(?:(?<series>[axbycz]\.?)(?<ordinal>\d+)(?:\{(?<sequence_ordinal>.+)\})?)|(?<series_internal>[m](?<internal_start>\d+):(?<internal_end>\d+)(?:\{(?<sequence_internal>.+)\})?)|(?<precursor>p)|(:?I(?<immonium>[ARNDCEQGHKMFPSTWYVIL])(?:\[(?<immonium_modification>(?:[^\]]+))\])?)|(?<reference>r(?:(?:\[(?<reference_label>[^\]]+)\])))|(?:f\{(?<formula>[A-Za-z0-9]+)\})|(?:_\{(?<named_compound>[^\{\}\s,\/]+)\})|(?:s\{(?<smiles>[^\}]+)\})|(?:(?<unannotated>\?)(?<unannotated_label>\d+)?))(?<neutral_losses>(?:[+-]\d*(?:(?:[A-Z][A-Za-z0-9]*)|(?:\[(?:(?:[A-Za-z0-9:\.]+))\])))+)?(?:(?<isotope>[+-]\d*)i)?(?:\[(?<adducts>M(:?[+-]\d*[A-Z][A-Za-z0-9]*)+)\])?(?:\^(?<charge>[+-]?\d+))?(?:\/(?<mass_error>[+-]?\d+(?:\.\d+)?)(?<mass_error_unit>ppm)?)?(?:\*(?<confidence>\d*(?:\.\d+)?))?
# Line breaks not introduced to preserve syntactic correctness.

# Verbose-mode pattern matching a single signed neutral loss/gain token.
# Named groups:
#   neutral_loss - the whole token (sign, coefficient and name)
#   sign         - optional leading "+" or "-"
#   coefficient  - optional run of digits preceding the name (may be empty)
#   formula      - an unbracketed name starting with an uppercase letter
#   braced_name  - a name wrapped in square brackets (the captured text
#                  includes the outer brackets); the name may itself be
#                  followed by one nested bracketed section
# Exactly one of `formula` / `braced_name` is populated per match.
# `NeutralName.parse` walks a string with `finditer` over this pattern.
neutral_loss_pattern = re.compile(
r"""\s*(?P<neutral_loss>(?:(?P<sign>[+-])?\s*(?P<coefficient>\d*)\s*
(?:(?P<formula>[A-Z][A-Za-z0-9]*)|
(?P<braced_name>\[
(?:
(?:[A-Za-z0-9:\.]+)(?:\[(?:[A-Za-z0-9\.:\-\ ]+)\])?
)
\])
)
))""",
re.X,
)


def _sre_to_ecma(pattern):
Expand Down Expand Up @@ -129,6 +140,8 @@ def combine_formula(tokens: List[str], leading_sign: bool = False) -> str:
"""
if not tokens:
return ''
if not isinstance(tokens[0], str):
tokens = [str(t) for t in tokens]
if not tokens[0].startswith("-") and leading_sign:
out = ['+' + tokens[0]]
else:
Expand All @@ -141,6 +154,107 @@ def combine_formula(tokens: List[str], leading_sign: bool = False) -> str:
return ''.join(out)


class NeutralNameType(Flag):
    """Classify how the name of a neutral loss/gain should be interpreted."""

    # A bracketed name that is present in the `ReferenceMolecule` registry.
    Reference = enauto()
    # An unbracketed name, treated as a chemical formula.
    Formula = enauto()
    # A bracketed name that is not a known reference molecule.
    BracedName = enauto()
    # Placeholder used before the type has been inferred.
    Unknown = enauto()


@dataclass
class NeutralName(object):
    """
    A single neutral loss or gain: a name with a signed integer multiplicity.

    The name may be given with or without enclosing square brackets. On
    construction the brackets (if any) are stripped from :attr:`name` and
    :attr:`delta_type` is inferred from the name, replacing whatever value
    was passed for it.

    Attributes
    ----------
    name : str
        The bare name, without enclosing brackets.
    delta_type : NeutralNameType
        How the name is interpreted. Always recomputed in ``__post_init__``.
    coefficient : int
        Signed multiplicity. Negative values denote a loss.
    """

    name: str
    delta_type: NeutralNameType = NeutralNameType.Unknown
    coefficient: int = 1

    def __post_init__(self):
        self.delta_type = self._infer_type()

    def format_name(self, leading_sign: bool = True) -> str:
        """
        Render this token as text.

        Parameters
        ----------
        leading_sign : bool
            Whether a non-negative token is prefixed with ``+``. Negative
            tokens are always prefixed with ``-``.

        Returns
        -------
        str
            The sign (if any), the coefficient when its magnitude exceeds
            one, and the name, re-wrapped in brackets for reference and
            braced names.
        """
        name = self.name
        if self.delta_type in (NeutralNameType.Reference, NeutralNameType.BracedName):
            name = f"[{name}]"
        coefficient = self.coefficient
        if coefficient < 0:
            # The "-" comes from the integer's own formatting when |coef| > 1.
            return f"{coefficient}{name}" if coefficient < -1 else f"-{name}"
        sign = "+" if leading_sign else ""
        count = coefficient if coefficient > 1 else ""
        return f"{sign}{count}{name}"

    def __str__(self):
        return self.format_name()

    def _infer_type(self):
        """
        Strip enclosing brackets from :attr:`name` and classify it.

        Returns
        -------
        NeutralNameType
            ``Reference`` or ``BracedName`` for a bracketed name, depending
            on whether it is in the `ReferenceMolecule` registry, otherwise
            ``Formula``.
        """
        if self.name.startswith('[') and self.name.endswith(']'):
            # Store the bare name; `format_name` restores the brackets.
            self.name = self.name[1:-1]
            if ReferenceMolecule.is_reference(self.name):
                return NeutralNameType.Reference
            return NeutralNameType.BracedName
        return NeutralNameType.Formula

    def mass(self) -> float:
        """
        Compute the signed mass contribution of this token.

        Returns
        -------
        float
            The mass of one unit of the named entity times the coefficient.

        Raises
        ------
        ValueError
            If :attr:`delta_type` is not one of ``Formula``, ``Reference``
            or ``BracedName``.
        """
        # NOTE: `self.name` already has its brackets removed by `_infer_type`,
        # so it is used as-is for every lookup below.
        if self.delta_type == NeutralNameType.Formula:
            mass = FormulaModification(self.name).mass
        elif self.delta_type == NeutralNameType.Reference:
            mass = ReferenceMolecule.get(self.name).neutral_mass
        elif self.delta_type == NeutralNameType.BracedName:
            mass = UnimodModification(self.name).mass
        else:
            raise ValueError(f"Cannot interpret {self.name} with type {self.delta_type}")
        return self.coefficient * mass

    def __eq__(self, other):
        """
        Compare against another token or against its textual form.

        A string is compared to the formatted token, with a leading sign
        only when the string itself starts with ``+``. Any other object is
        compared by ``name`` and ``coefficient``; objects lacking those
        attributes yield ``NotImplemented``.
        """
        if other is None:
            return False
        if isinstance(other, str):
            return self.format_name(other.startswith("+")) == other
        try:
            return self.name == other.name and self.coefficient == other.coefficient
        except AttributeError:
            return NotImplemented

    @classmethod
    def parse(cls, string: str) -> List['NeutralName']:
        """
        Parse a run of signed neutral loss tokens, e.g. ``-H2O+2NH3``.

        Parameters
        ----------
        string : str
            The text to parse. A falsy value produces an empty list.

        Returns
        -------
        List[NeutralName]
            One entry per match of ``neutral_loss_pattern``, in order.
        """
        if not string:
            return []
        names = []
        for match in neutral_loss_pattern.finditer(string):
            groups = match.groupdict()
            # A missing or empty coefficient means one; a missing sign means "+".
            coef = int(groups['coefficient'] or 1)
            if (groups['sign'] or '+') == '-':
                coef = -coef
            names.append(cls(groups["formula"] or groups["braced_name"], coefficient=coef))
        return names

    @classmethod
    def combine(cls, tokens: List["NeutralName"], leading_sign: bool) -> str:
        """
        Concatenate tokens into a single string.

        Parameters
        ----------
        tokens : List[NeutralName]
            The tokens to join, in order.
        leading_sign : bool
            Whether a non-negative first token is prefixed with ``+``.
            Every later token always carries its sign.

        Returns
        -------
        str
            The joined text, or the empty string when `tokens` is empty.
        """
        if not tokens:
            return ""
        # Negative tokens emit "-" regardless of `leading_sign`, so the first
        # token can simply be formatted with the caller's preference.
        out = [tokens[0].format_name(leading_sign=leading_sign)]
        out.extend(token.format_name(leading_sign=True) for token in tokens[1:])
        return "".join(out)



class MassError(object):
"""
Represent the mass error of a peak annotation.
Expand Down Expand Up @@ -316,7 +430,7 @@ def serialize(self) -> str:
parts.append(f"{self.analyte_reference}@")
parts.append(self._format_ion())
if self.neutral_losses:
parts.append(combine_formula(
parts.append(NeutralName.combine(
self.neutral_losses, leading_sign=True))
if self.isotope != 0:
sign = "+" if self.isotope > 0 else "-"
Expand Down Expand Up @@ -361,7 +475,7 @@ def to_json(self, exclude_missing=False) -> JSONDict:
"""
#TODO: When neutral losses and adducts are formalized types, convert to string/JSON here
d = {}
skips = ('series', 'rest', 'is_auxiliary')
skips = ('series', 'rest', 'is_auxiliary', 'neutral_losses')
for key in IonAnnotationBase.__slots__:
if key in skips:
continue
Expand All @@ -371,6 +485,7 @@ def to_json(self, exclude_missing=False) -> JSONDict:
value = getattr(self, key)
if (value is not None) or not exclude_missing:
d[key] = value
d['neutral_losses'] = [str(s) for s in self.neutral_losses]
d['molecule_description'] = self._molecule_description()
# if d['analyte_reference'] is None:
# d['analyte_reference'] =
Expand All @@ -383,6 +498,16 @@ def _populate_from_dict(self, data) -> 'IonAnnotationBase':
continue
elif key == 'mass_error' and value is not None:
self.mass_error = MassError(value['value'], value['unit'])
elif key == "neutral_losses" and value is not None:
if isinstance(value, str):
self.neutral_losses = NeutralName.parse(value)
elif isinstance(value, (list, tuple)):
self.neutral_losses = []
for tok in value:
self.neutral_losses.extend(NeutralName.parse(tok))
else:
self.neutral_losses = []
warnings.warn(f"Failed to coerce {value} to neutral losses")
else:
setattr(self, key, value)
self.rest = None
Expand Down Expand Up @@ -927,7 +1052,8 @@ def _coerce_analyte_reference(self, data: Dict[str, str]) -> str:
return data.get("analyte_reference", '1')

def _coerce_neutral_losses(self, data: Dict[str, str]) -> List:
    """
    Convert the raw ``neutral_losses`` entry of `data` into parsed tokens.

    Parameters
    ----------
    data : Dict[str, str]
        The mapping to read the ``neutral_losses`` key from.

    Returns
    -------
    List
        The result of :meth:`NeutralName.parse` on the entry; an empty list
        when the key is absent or its value is falsy.
    """
    # The earlier `tokenize_signed_symbol_list` return was stale and made the
    # `NeutralName` path unreachable; only the structured parser remains.
    return NeutralName.parse(data.get("neutral_losses", ''))

def _coerce_mass_error(self, data: Dict[str, str]) -> MassError:
mass_error = data.get("mass_error")
Expand Down
25 changes: 0 additions & 25 deletions implementations/python/mzpaf/data/reference_molecules.json
Original file line number Diff line number Diff line change
@@ -1,29 +1,4 @@
{
"Hex": {
"molecule_type": "monosaccharide",
"neutral_mass": 162.0528234185,
"chemical_formula": "C6H10O5"
},
"HexNAc": {
"molecule_type": "monosaccharide",
"neutral_mass": 203.07937251951,
"chemical_formula": "C8H13N1O5"
},
"dHex": {
"molecule_type": "monosaccharide",
"neutral_mass": 146.05790879894,
"chemical_formula": "C6H10O4"
},
"NeuAc": {
"molecule_type": "monosaccharide",
"neutral_mass": 291.09541650647,
"chemical_formula": "C11H17N1O8"
},
"NeuGc": {
"molecule_type": "monosaccharide",
"neutral_mass": 307.09033112603,
"chemical_formula": "C11H17N1O9"
},
"TMT126": {
"label_type": "TMT",
"molecule_type": "reporter",
Expand Down
5 changes: 5 additions & 0 deletions implementations/python/mzpaf/reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,11 @@ def get(cls, name: str) -> 'ReferenceMolecule':
cls._load_registry()
return cls._registry[name]

@classmethod
def is_reference(cls, name: str) -> bool:
    """
    Test whether `name` is a key of the class-level registry.

    The registry is loaded via ``_load_registry`` first if it has not been
    populated yet.

    Parameters
    ----------
    name : str
        The name to look up.

    Returns
    -------
    bool
        ``True`` when `name` is present in the registry.
    """
    registry = cls._registry
    if registry is None:
        cls._load_registry()
        # Re-read the attribute after the load step.
        registry = cls._registry
    return name in registry


def load_json(stream) -> Dict[str, ReferenceMolecule]:
Expand Down
25 changes: 0 additions & 25 deletions specification/reference_data/reference_molecules.json
Original file line number Diff line number Diff line change
@@ -1,29 +1,4 @@
{
"Hex": {
"molecule_type": "monosaccharide",
"neutral_mass": 162.0528234185,
"chemical_formula": "C6H10O5"
},
"HexNAc": {
"molecule_type": "monosaccharide",
"neutral_mass": 203.07937251951,
"chemical_formula": "C8H13N1O5"
},
"dHex": {
"molecule_type": "monosaccharide",
"neutral_mass": 146.05790879894,
"chemical_formula": "C6H10O4"
},
"NeuAc": {
"molecule_type": "monosaccharide",
"neutral_mass": 291.09541650647,
"chemical_formula": "C11H17N1O8"
},
"NeuGc": {
"molecule_type": "monosaccharide",
"neutral_mass": 307.09033112603,
"chemical_formula": "C11H17N1O9"
},
"TMT126": {
"label_type": "TMT",
"molecule_type": "reporter",
Expand Down
Loading

0 comments on commit 38026db

Please sign in to comment.