Skip to content

Commit

Permalink
Squashed 'modules/pmi/' changes from 95043d139f..6103587aa0
Browse files Browse the repository at this point in the history
6103587aa0 Add more docs to ProtocolOutput
348f82fd40 flake8 fix
86a8bbbb6b Trim non-represented residues from IHM sequence

git-subtree-dir: modules/pmi
git-subtree-split: 6103587aa0e22bf1d0864e67cc859e3ed794d537
  • Loading branch information
benmwebb committed Sep 27, 2024
1 parent 7b6753f commit 56b9461
Show file tree
Hide file tree
Showing 4 changed files with 197 additions and 36 deletions.
116 changes: 98 additions & 18 deletions modules/pmi/pyext/src/mmcif.py
Original file line number Diff line number Diff line change
Expand Up @@ -1184,12 +1184,21 @@ def __set_name(self, val):


class Entity(ihm.Entity):
"""A single entity in the system.
"""A single entity in the system. This contains information (such as
database identifiers) specific to a particular sequence rather than
a copy (for example, when modeling a homodimer, two AsymUnits will
point to the same Entity).
This functions identically to the base ihm.Entity class, but it
allows identifying residues by either the IHM numbering scheme
(seq_id, which is always contiguous starting at 1) or by the PMI
scheme (which is similar, but may not start at 1 if the sequence in
the FASTA file has one or more N-terminal gaps"""
allows identifying residues by either the PMI numbering scheme
(which is always contiguous starting at 1, covering the entire
sequence in the FASTA files, or the IHM scheme (seq_id, which also
starts at 1, but which only covers the modeled subset of the full
sequence, with non-modeled N-terminal or C-terminal residues
removed). The actual offset (which is the integer to be added to the
IHM numbering to get PMI numbering, or equivalently the number of
not-represented N-terminal residues in the PMI sequence) is
available in the `pmi_offset` member."""
def __init__(self, sequence, pmi_offset, *args, **keys):
# Offset between PMI numbering and IHM; <pmi_#> = <ihm_#> + pmi_offset
# (pmi_offset is also the number of N-terminal gaps in the FASTA file)
Expand All @@ -1207,12 +1216,22 @@ def pmi_range(self, res_id_begin, res_id_end):


class AsymUnit(ihm.AsymUnit):
"""A single asymmetric unit in the system.
"""A single asymmetric unit in the system. This roughly corresponds to
a single PMI subunit (sequence, copy, or clone).
This functions identically to the base ihm.AsymUnit class, but it
allows identifying residues by either the IHM numbering scheme
(seq_id, which is always contiguous starting at 1) or by the PMI
scheme (which is similar, but may not start at 1 if the sequence in
the FASTA file has one or more N-terminal gaps"""
allows identifying residues by either the PMI numbering scheme
(which is always contiguous starting at 1, covering the entire
sequence in the FASTA files, or the IHM scheme (seq_id, which also
starts at 1, but which only covers the modeled subset of the full
sequence, with non-modeled N-terminal or C-terminal residues
removed).
The `entity` member of this class points to an Entity object, which
contains information (such as database identifiers) specific to
a particular sequence rather than a copy (for example, when modeling
a homodimer, two AsymUnits will point to the same Entity).
"""

def __init__(self, entity, *args, **keys):
super().__init__(
Expand Down Expand Up @@ -1243,7 +1262,14 @@ class ProtocolOutput(IMP.pmi.output.ProtocolOutput):
[python-ihm API](https://python-ihm.readthedocs.io/en/latest/dumper.html#ihm.dumper.write)
to write out files in mmCIF or BinaryCIF format.
See also get_handlers(), get_dumpers().
Each PMI subunit will be mapped to an IHM AsymUnit class which contains the
subset of the sequence that was represented. Use the `asym_units` dict to
get this object given a PMI subunit name. Each unique sequence will be
mapped to an IHM Entity class (for example when modeling a homodimer
there will be two AsymUnits which both point to the same Entity). Use
the `entities` dict to get this object from a PMI subunit name.
See also Entity, AsymUnit, get_handlers(), get_dumpers().
""" # noqa: E501
def __init__(self):
# Ultimately, collect data in an ihm.System object
Expand Down Expand Up @@ -1375,18 +1401,15 @@ def add_component_sequence(self, state, name, seq, asym_name=None,
if asym_name is None:
asym_name = name

def get_offset(seq):
# Count length of gaps at start of sequence
for i in range(len(seq)):
if seq[i] != '-':
return seq[i:], i
seq, offset = get_offset(seq)
if name in self.sequence_dict:
if self.sequence_dict[name] != seq:
raise ValueError("Sequence mismatch for component %s" % name)
else:
self.sequence_dict[name] = seq
self.entities.add(name, seq, offset, alphabet)
# Offset is always zero to start with; this may be modified
# later in finalize_build() if any non-modeled N-terminal
# residues are removed
self.entities.add(name, seq, 0, alphabet)
if asym_name in self.asym_units:
if self.asym_units[asym_name] is None:
# Set up a new asymmetric unit for this component
Expand All @@ -1396,6 +1419,12 @@ def get_offset(seq):
self.asym_units[asym_name] = asym
state.modeled_assembly.append(self.asym_units[asym_name])

def finalize_build(self):
"""Called immediately after the PMI system is built"""
for entity in self.system.entities:
_trim_unrep_termini(entity, self.system.asym_units,
self.system.orphan_representations)

def _add_restraint_model_fits(self):
"""Add fits to restraints for all known models"""
for group, m in self.system._all_models():
Expand Down Expand Up @@ -1740,3 +1769,54 @@ def parse_file(self, filename):
elif line.startswith('# ncenters: '):
ret['number_of_gaussians'] = int(line[12:])
return ret


def _trim_unrep_termini(entity, asyms, representations):
"""Trim Entity sequence to only cover represented residues.
PDB policy is for amino acid Entity sequences to be polymers (so
they should include any loops or other gaps in the midst of the
sequence) but for the termini to be trimmed of any not-modeled
residues. Here, we modify the Entity sequence to remove any parts
that are not included in any representation. This may change the
numbering if any N-terminal residues are removed, and thus the offset
between PMI and IHM numbering, as both count from 1."""
rep_range = None
for rep in representations:
for seg in rep:
if seg.asym_unit.entity is entity:
seg_range = seg.asym_unit.seq_id_range
if rep_range is None:
rep_range = list(seg_range)
else:
rep_range[0] = min(rep_range[0], seg_range[0])
rep_range[1] = max(rep_range[1], seg_range[1])
# Do nothing if no representations or if everything is represented
if rep_range is None or rep_range == [1, len(entity.sequence)]:
return
# Update offset (= number of unrepresented N terminal entity residues)
# in entity and all asyms
pmi_offset = rep_range[0] - 1
entity.pmi_offset = pmi_offset
for asym in asyms:
if asym.entity is entity:
asym.auth_seq_id_map = entity.pmi_offset
# Modify entity sequence to only include represented residues
entity.sequence = entity.sequence[rep_range[0] - 1:rep_range[1]]

# If N terminal deletions, we must fix the numbering of all segments
# and starting models
if pmi_offset != 0:
for rep in representations:
for seg in rep:
if seg.asym_unit.entity is entity:
seg_range = seg.asym_unit.seq_id_range
seg.asym_unit.seq_id_range = (seg_range[0] - pmi_offset,
seg_range[1] - pmi_offset)
if seg.starting_model:
model = seg.starting_model
seg_range = model.asym_unit.seq_id_range
model.asym_unit.seq_id_range = \
(seg_range[0] - pmi_offset,
seg_range[1] - pmi_offset)
model.offset = model.offset - pmi_offset
2 changes: 2 additions & 0 deletions modules/pmi/pyext/src/topology/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,8 @@ def build(self, **kwargs):
for state in self.states:
state.build(**kwargs)
self.built = True
for po in self._protocol_output:
po.finalize_build()
return self.hier

def add_protocol_output(self, p):
Expand Down
39 changes: 24 additions & 15 deletions modules/pmi/test/test_mmcif.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,11 +661,19 @@ def test_starting_model_dumper(self):
nup84_2.add_structure(self.get_input_file_name('test.nup84.pdb'), 'A')
nup84_2.add_representation(resolutions=[1])

nup85 = st2.create_molecule("Nup85", "SELM", "B")
nup85 = st2.create_molecule("Nup85", "GGGGSELMGG", "B")
# Residues S, E should be residues 8, 9 in test.nup85.pdb; map them
# to FASTA sequence
nup85.add_structure(self.get_input_file_name('test.nup85.pdb'), 'A',
res_range=(8, 9), offset=-7)
nup85.add_representation(resolutions=[1])
res_range=(8, 9), offset=-3)
nup85.add_representation(residues=nup85[4:8], resolutions=[1])
_ = s.build()
# Since we didn't represent the first 4 or last 2 residues in nup85,
# the IHM sequence should just be "SELM", and the PDB offset should be
# modified to match (from -3 to -7)
self.assertEqual(''.join(x.code
for x in po.system.entities[1].sequence),
'SELM')

self.assign_entity_asym_ids(po.system)

Expand Down Expand Up @@ -1214,26 +1222,27 @@ class DummyRestraint:
s.add_protocol_output(po)
state = s.create_state()
po_state = state._protocol_output[0][1]
nup84 = state.create_molecule("Nup84", "MELS", "A")
nup84.add_structure(self.get_input_file_name('test.nup84.pdb'), 'A')
nup84.add_representation(nup84.get_atomic_residues(), resolutions=[1])
nup84.add_representation(nup84.get_non_atomic_residues(),
resolutions=[10])
# First 10 residues are not represented, so PMI residue indexes
# range from 11 to 14, but IHM seq_ids from 1 to 4
nup84 = state.create_molecule("Nup84", "P" * 10 + "MELS", "A")
nup84.add_structure(self.get_input_file_name('test.nup84.pdb'), 'A',
offset=10)
nup84.add_representation(nup84[10:], resolutions=[1])
hier = s.build()

r = DummyRestraint()
r.dataset = DummyDataset()
r.dataset._id = 42
xl_group = po.get_cross_link_group(r)
ex_xl = po.add_experimental_cross_link(1, 'Nup84',
2, 'Nup84', xl_group)
_ = po.add_experimental_cross_link(1, 'Nup84',
3, 'Nup84', xl_group)
ex_xl = po.add_experimental_cross_link(11, 'Nup84',
12, 'Nup84', xl_group)
_ = po.add_experimental_cross_link(11, 'Nup84',
13, 'Nup84', xl_group)
# Duplicates should be ignored
po.add_experimental_cross_link(1, 'Nup84', 3, 'Nup84', xl_group)
po.add_experimental_cross_link(11, 'Nup84', 13, 'Nup84', xl_group)
# Non-modeled component should be ignored
nm_ex_xl = po.add_experimental_cross_link(1, 'Nup85',
2, 'Nup84', xl_group)
nm_ex_xl = po.add_experimental_cross_link(11, 'Nup85',
12, 'Nup84', xl_group)
self.assertEqual(nm_ex_xl, None)
sel = IMP.atom.Selection(hier, molecule='Nup84',
resolution=1)
Expand Down
76 changes: 73 additions & 3 deletions modules/pmi/test/test_mmcif_pmi2.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,9 @@ def test_entity(self):
w = ihm.format.CifWriter(fh)
d = ihm.dumper._EntityDumper()
d.finalize(po.system)
self.assertEqual(''.join(x.code
for x in po.system.entities[0].sequence),
'MELS')
d.dump(po.system, w)
out = fh.getvalue()
self.assertEqual(out, """#
Expand All @@ -146,17 +149,37 @@ def test_entity(self):
#
""")

def test_entity_trimmed(self):
"""Test that Entity sequence is trimmed to represented region"""
m = IMP.Model()
s = IMP.pmi.topology.System(m)
po = IMP.pmi.mmcif.ProtocolOutput()
s.add_protocol_output(po)
state = s.create_state()
nup84 = state.create_molecule("Nup84", "GGGMELSGGGG", "A")
# Only "MELS" is represented, so the output entity should be of
# length 4 and weigh 532, not the full PMI sequence
nup84.add_representation(residues=nup84[3:7], resolutions=[1])
hier = s.build()
self.assertEqual(''.join(x.code
for x in po.system.entities[0].sequence),
'MELS')
self.assertAlmostEqual(po.system.entities[0].formula_weight,
532.606, delta=1e-2)

def test_model_representation(self):
"""Test ModelRepresentationDumper with PMI2-style init"""
m = IMP.Model()
s = IMP.pmi.topology.System(m)
po = IMP.pmi.mmcif.ProtocolOutput()
s.add_protocol_output(po)
state = s.create_state()
nup84 = state.create_molecule("Nup84", "MELS", "A")
nup84.add_structure(self.get_input_file_name('test.nup84.pdb'), 'A')
nup84.add_representation(resolutions=[1])
nup84 = state.create_molecule("Nup84", "GGGGMELSGG", "A")
nup84.add_structure(self.get_input_file_name('test.nup84.pdb'), 'A',
offset=4)
nup84.add_representation(resolutions=[1], residues=nup84[4:8])
hier = s.build()
print([x.code for x in po.system.entities[0].sequence])
fh = StringIO()
w = ihm.format.CifWriter(fh)
self.assign_entity_asym_ids(po.system)
Expand Down Expand Up @@ -196,5 +219,52 @@ def test_model_representation(self):
#
""")

def test_poly_seq_scheme_trimmed(self):
"""Test that non-modeled termini are trimmed in scheme tables"""
m = IMP.Model()
s = IMP.pmi.topology.System(m)
po = IMP.pmi.mmcif.ProtocolOutput()
s.add_protocol_output(po)
state = s.create_state()
nup84_a = state.create_molecule("Nup84a", "PWYACMELSWAG", "A")
nup84_a.add_representation(residues=nup84_a[4:7], resolutions=[1])
nup84_b = state.create_molecule("Nup84b", "PWYACMELSWAG", "A")
nup84_b.add_representation(residues=nup84_b[2:5], resolutions=[1])
hier = s.build()
self.assertEqual(len(po.system.entities), 1)
self.assertEqual(po.system.entities[0].pmi_offset, 2)
fh = StringIO()
w = ihm.format.CifWriter(fh)
self.assign_entity_asym_ids(po.system)
self.assign_range_ids(po.system)
d = ihm.dumper._PolySeqSchemeDumper()
d.dump(po.system, w)
out = fh.getvalue()
self.assertEqual(out, """#
loop_
_pdbx_poly_seq_scheme.asym_id
_pdbx_poly_seq_scheme.entity_id
_pdbx_poly_seq_scheme.seq_id
_pdbx_poly_seq_scheme.mon_id
_pdbx_poly_seq_scheme.pdb_seq_num
_pdbx_poly_seq_scheme.auth_seq_num
_pdbx_poly_seq_scheme.pdb_mon_id
_pdbx_poly_seq_scheme.auth_mon_id
_pdbx_poly_seq_scheme.pdb_strand_id
_pdbx_poly_seq_scheme.pdb_ins_code
A 1 1 TYR 3 3 TYR TYR A .
A 1 2 ALA 4 4 ALA ALA A .
A 1 3 CYS 5 5 CYS CYS A .
A 1 4 MET 6 6 MET MET A .
A 1 5 GLU 7 7 GLU GLU A .
B 1 1 TYR 3 3 TYR TYR B .
B 1 2 ALA 4 4 ALA ALA B .
B 1 3 CYS 5 5 CYS CYS B .
B 1 4 MET 6 6 MET MET B .
B 1 5 GLU 7 7 GLU GLU B .
#
""")


if __name__ == '__main__':
IMP.test.main()

0 comments on commit 56b9461

Please sign in to comment.