Squashed 'modules/pmi/' changes from 95043d139f..6103587aa0

6103587aa0 Add more docs to ProtocolOutput 348f82fd40 flake8 fix 86a8bbbb6b Trim non-represented residues from IHM sequence git-subtree-dir: modules/pmi git-subtree-split: 6103587aa0e22bf1d0864e67cc859e3ed794d537
salilab · Sep 27, 2024 · 56b9461 · 56b9461
1 parent 7b6753f
commit 56b9461
Show file tree

Hide file tree

Showing 4 changed files with 197 additions and 36 deletions.
diff --git a/modules/pmi/pyext/src/mmcif.py b/modules/pmi/pyext/src/mmcif.py
@@ -1184,12 +1184,21 @@ def __set_name(self, val):
 
 
 class Entity(ihm.Entity):
-    """A single entity in the system.
+    """A single entity in the system. This contains information (such as
+       database identifiers) specific to a particular sequence rather than
+       a copy (for example, when modeling a homodimer, two AsymUnits will
+       point to the same Entity).
+
        This functions identically to the base ihm.Entity class, but it
-       allows identifying residues by either the IHM numbering scheme
-       (seq_id, which is always contiguous starting at 1) or by the PMI
-       scheme (which is similar, but may not start at 1 if the sequence in
-       the FASTA file has one or more N-terminal gaps"""
+       allows identifying residues by either the PMI numbering scheme
+       (which is always contiguous starting at 1, covering the entire
+       sequence in the FASTA files, or the IHM scheme (seq_id, which also
+       starts at 1, but which only covers the modeled subset of the full
+       sequence, with non-modeled N-terminal or C-terminal residues
+       removed). The actual offset (which is the integer to be added to the
+       IHM numbering to get PMI numbering, or equivalently the number of
+       not-represented N-terminal residues in the PMI sequence) is
+       available in the `pmi_offset` member."""
     def __init__(self, sequence, pmi_offset, *args, **keys):
         # Offset between PMI numbering and IHM; <pmi_#> = <ihm_#> + pmi_offset
         # (pmi_offset is also the number of N-terminal gaps in the FASTA file)
@@ -1207,12 +1216,22 @@ def pmi_range(self, res_id_begin, res_id_end):
 
 
 class AsymUnit(ihm.AsymUnit):
-    """A single asymmetric unit in the system.
+    """A single asymmetric unit in the system. This roughly corresponds to
+       a single PMI subunit (sequence, copy, or clone).
+
        This functions identically to the base ihm.AsymUnit class, but it
-       allows identifying residues by either the IHM numbering scheme
-       (seq_id, which is always contiguous starting at 1) or by the PMI
-       scheme (which is similar, but may not start at 1 if the sequence in
-       the FASTA file has one or more N-terminal gaps"""
+       allows identifying residues by either the PMI numbering scheme
+       (which is always contiguous starting at 1, covering the entire
+       sequence in the FASTA files, or the IHM scheme (seq_id, which also
+       starts at 1, but which only covers the modeled subset of the full
+       sequence, with non-modeled N-terminal or C-terminal residues
+       removed).
+
+       The `entity` member of this class points to an Entity object, which
+       contains information (such as database identifiers) specific to
+       a particular sequence rather than a copy (for example, when modeling
+       a homodimer, two AsymUnits will point to the same Entity).
+       """
 
     def __init__(self, entity, *args, **keys):
         super().__init__(
@@ -1243,7 +1262,14 @@ class ProtocolOutput(IMP.pmi.output.ProtocolOutput):
     [python-ihm API](https://python-ihm.readthedocs.io/en/latest/dumper.html#ihm.dumper.write)
     to write out files in mmCIF or BinaryCIF format.
 
-    See also get_handlers(), get_dumpers().
+    Each PMI subunit will be mapped to an IHM AsymUnit class which contains the
+    subset of the sequence that was represented. Use the `asym_units` dict to
+    get this object given a PMI subunit name. Each unique sequence will be
+    mapped to an IHM Entity class (for example when modeling a homodimer
+    there will be two AsymUnits which both point to the same Entity). Use
+    the `entities` dict to get this object from a PMI subunit name.
+
+    See also Entity, AsymUnit, get_handlers(), get_dumpers().
     """  # noqa: E501
     def __init__(self):
         # Ultimately, collect data in an ihm.System object
@@ -1375,18 +1401,15 @@ def add_component_sequence(self, state, name, seq, asym_name=None,
         if asym_name is None:
             asym_name = name
 
-        def get_offset(seq):
-            # Count length of gaps at start of sequence
-            for i in range(len(seq)):
-                if seq[i] != '-':
-                    return seq[i:], i
-        seq, offset = get_offset(seq)
         if name in self.sequence_dict:
             if self.sequence_dict[name] != seq:
                 raise ValueError("Sequence mismatch for component %s" % name)
         else:
             self.sequence_dict[name] = seq
-            self.entities.add(name, seq, offset, alphabet)
+            # Offset is always zero to start with; this may be modified
+            # later in finalize_build() if any non-modeled N-terminal
+            # residues are removed
+            self.entities.add(name, seq, 0, alphabet)
         if asym_name in self.asym_units:
             if self.asym_units[asym_name] is None:
                 # Set up a new asymmetric unit for this component
@@ -1396,6 +1419,12 @@ def get_offset(seq):
                 self.asym_units[asym_name] = asym
             state.modeled_assembly.append(self.asym_units[asym_name])
 
+    def finalize_build(self):
+        """Called immediately after the PMI system is built"""
+        for entity in self.system.entities:
+            _trim_unrep_termini(entity, self.system.asym_units,
+                                self.system.orphan_representations)
+
     def _add_restraint_model_fits(self):
         """Add fits to restraints for all known models"""
         for group, m in self.system._all_models():
@@ -1740,3 +1769,54 @@ def parse_file(self, filename):
                 elif line.startswith('# ncenters: '):
                     ret['number_of_gaussians'] = int(line[12:])
         return ret
+
+
+def _trim_unrep_termini(entity, asyms, representations):
+    """Trim Entity sequence to only cover represented residues.
+
+       PDB policy is for amino acid Entity sequences to be polymers (so
+       they should include any loops or other gaps in the midst of the
+       sequence) but for the termini to be trimmed of any not-modeled
+       residues. Here, we modify the Entity sequence to remove any parts
+       that are not included in any representation. This may change the
+       numbering if any N-terminal residues are removed, and thus the offset
+       between PMI and IHM numbering, as both count from 1."""
+    rep_range = None
+    for rep in representations:
+        for seg in rep:
+            if seg.asym_unit.entity is entity:
+                seg_range = seg.asym_unit.seq_id_range
+                if rep_range is None:
+                    rep_range = list(seg_range)
+                else:
+                    rep_range[0] = min(rep_range[0], seg_range[0])
+                    rep_range[1] = max(rep_range[1], seg_range[1])
+    # Do nothing if no representations or if everything is represented
+    if rep_range is None or rep_range == [1, len(entity.sequence)]:
+        return
+    # Update offset (= number of unrepresented N terminal entity residues)
+    # in entity and all asyms
+    pmi_offset = rep_range[0] - 1
+    entity.pmi_offset = pmi_offset
+    for asym in asyms:
+        if asym.entity is entity:
+            asym.auth_seq_id_map = entity.pmi_offset
+    # Modify entity sequence to only include represented residues
+    entity.sequence = entity.sequence[rep_range[0] - 1:rep_range[1]]
+
+    # If N terminal deletions, we must fix the numbering of all segments
+    # and starting models
+    if pmi_offset != 0:
+        for rep in representations:
+            for seg in rep:
+                if seg.asym_unit.entity is entity:
+                    seg_range = seg.asym_unit.seq_id_range
+                    seg.asym_unit.seq_id_range = (seg_range[0] - pmi_offset,
+                                                  seg_range[1] - pmi_offset)
+                    if seg.starting_model:
+                        model = seg.starting_model
+                        seg_range = model.asym_unit.seq_id_range
+                        model.asym_unit.seq_id_range = \
+                            (seg_range[0] - pmi_offset,
+                             seg_range[1] - pmi_offset)
+                        model.offset = model.offset - pmi_offset
diff --git a/modules/pmi/pyext/src/topology/__init__.py b/modules/pmi/pyext/src/topology/__init__.py
@@ -201,6 +201,8 @@ def build(self, **kwargs):
             for state in self.states:
                 state.build(**kwargs)
             self.built = True
+            for po in self._protocol_output:
+                po.finalize_build()
         return self.hier
 
     def add_protocol_output(self, p):

diff --git a/modules/pmi/test/test_mmcif.py b/modules/pmi/test/test_mmcif.py
@@ -661,11 +661,19 @@ def test_starting_model_dumper(self):
         nup84_2.add_structure(self.get_input_file_name('test.nup84.pdb'), 'A')
         nup84_2.add_representation(resolutions=[1])
 
-        nup85 = st2.create_molecule("Nup85", "SELM", "B")
+        nup85 = st2.create_molecule("Nup85", "GGGGSELMGG", "B")
+        # Residues S, E should be residues 8, 9 in test.nup85.pdb; map them
+        # to FASTA sequence
         nup85.add_structure(self.get_input_file_name('test.nup85.pdb'), 'A',
-                            res_range=(8, 9), offset=-7)
-        nup85.add_representation(resolutions=[1])
+                            res_range=(8, 9), offset=-3)
+        nup85.add_representation(residues=nup85[4:8], resolutions=[1])
         _ = s.build()
+        # Since we didn't represent the first 4 or last 2 residues in nup85,
+        # the IHM sequence should just be "SELM", and the PDB offset should be
+        # modified to match (from -3 to -7)
+        self.assertEqual(''.join(x.code
+                                 for x in po.system.entities[1].sequence),
+                         'SELM')
 
         self.assign_entity_asym_ids(po.system)
 
@@ -1214,26 +1222,27 @@ class DummyRestraint:
         s.add_protocol_output(po)
         state = s.create_state()
         po_state = state._protocol_output[0][1]
-        nup84 = state.create_molecule("Nup84", "MELS", "A")
-        nup84.add_structure(self.get_input_file_name('test.nup84.pdb'), 'A')
-        nup84.add_representation(nup84.get_atomic_residues(), resolutions=[1])
-        nup84.add_representation(nup84.get_non_atomic_residues(),
-                                 resolutions=[10])
+        # First 10 residues are not represented, so PMI residue indexes
+        # range from 11 to 14, but IHM seq_ids from 1 to 4
+        nup84 = state.create_molecule("Nup84", "P" * 10 + "MELS", "A")
+        nup84.add_structure(self.get_input_file_name('test.nup84.pdb'), 'A',
+                            offset=10)
+        nup84.add_representation(nup84[10:], resolutions=[1])
         hier = s.build()
 
         r = DummyRestraint()
         r.dataset = DummyDataset()
         r.dataset._id = 42
         xl_group = po.get_cross_link_group(r)
-        ex_xl = po.add_experimental_cross_link(1, 'Nup84',
-                                               2, 'Nup84', xl_group)
-        _ = po.add_experimental_cross_link(1, 'Nup84',
-                                           3, 'Nup84', xl_group)
+        ex_xl = po.add_experimental_cross_link(11, 'Nup84',
+                                               12, 'Nup84', xl_group)
+        _ = po.add_experimental_cross_link(11, 'Nup84',
+                                           13, 'Nup84', xl_group)
         # Duplicates should be ignored
-        po.add_experimental_cross_link(1, 'Nup84', 3, 'Nup84', xl_group)
+        po.add_experimental_cross_link(11, 'Nup84', 13, 'Nup84', xl_group)
         # Non-modeled component should be ignored
-        nm_ex_xl = po.add_experimental_cross_link(1, 'Nup85',
-                                                  2, 'Nup84', xl_group)
+        nm_ex_xl = po.add_experimental_cross_link(11, 'Nup85',
+                                                  12, 'Nup84', xl_group)
         self.assertEqual(nm_ex_xl, None)
         sel = IMP.atom.Selection(hier, molecule='Nup84',
                                  resolution=1)

diff --git a/modules/pmi/test/test_mmcif_pmi2.py b/modules/pmi/test/test_mmcif_pmi2.py
@@ -131,6 +131,9 @@ def test_entity(self):
         w = ihm.format.CifWriter(fh)
         d = ihm.dumper._EntityDumper()
         d.finalize(po.system)
+        self.assertEqual(''.join(x.code
+                                 for x in po.system.entities[0].sequence),
+                         'MELS')
         d.dump(po.system, w)
         out = fh.getvalue()
         self.assertEqual(out, """#
@@ -146,17 +149,37 @@ def test_entity(self):
 #
 """)
 
+    def test_entity_trimmed(self):
+        """Test that Entity sequence is trimmed to represented region"""
+        m = IMP.Model()
+        s = IMP.pmi.topology.System(m)
+        po = IMP.pmi.mmcif.ProtocolOutput()
+        s.add_protocol_output(po)
+        state = s.create_state()
+        nup84 = state.create_molecule("Nup84", "GGGMELSGGGG", "A")
+        # Only "MELS" is represented, so the output entity should be of
+        # length 4 and weigh 532, not the full PMI sequence
+        nup84.add_representation(residues=nup84[3:7], resolutions=[1])
+        hier = s.build()
+        self.assertEqual(''.join(x.code
+                                 for x in po.system.entities[0].sequence),
+                         'MELS')
+        self.assertAlmostEqual(po.system.entities[0].formula_weight,
+                               532.606, delta=1e-2)
+
     def test_model_representation(self):
         """Test ModelRepresentationDumper with PMI2-style init"""
         m = IMP.Model()
         s = IMP.pmi.topology.System(m)
         po = IMP.pmi.mmcif.ProtocolOutput()
         s.add_protocol_output(po)
         state = s.create_state()
-        nup84 = state.create_molecule("Nup84", "MELS", "A")
-        nup84.add_structure(self.get_input_file_name('test.nup84.pdb'), 'A')
-        nup84.add_representation(resolutions=[1])
+        nup84 = state.create_molecule("Nup84", "GGGGMELSGG", "A")
+        nup84.add_structure(self.get_input_file_name('test.nup84.pdb'), 'A',
+                            offset=4)
+        nup84.add_representation(resolutions=[1], residues=nup84[4:8])
         hier = s.build()
+        print([x.code for x in po.system.entities[0].sequence])
         fh = StringIO()
         w = ihm.format.CifWriter(fh)
         self.assign_entity_asym_ids(po.system)
@@ -196,5 +219,52 @@ def test_model_representation(self):
 #
 """)
 
+    def test_poly_seq_scheme_trimmed(self):
+        """Test that non-modeled termini are trimmed in scheme tables"""
+        m = IMP.Model()
+        s = IMP.pmi.topology.System(m)
+        po = IMP.pmi.mmcif.ProtocolOutput()
+        s.add_protocol_output(po)
+        state = s.create_state()
+        nup84_a = state.create_molecule("Nup84a", "PWYACMELSWAG", "A")
+        nup84_a.add_representation(residues=nup84_a[4:7], resolutions=[1])
+        nup84_b = state.create_molecule("Nup84b", "PWYACMELSWAG", "A")
+        nup84_b.add_representation(residues=nup84_b[2:5], resolutions=[1])
+        hier = s.build()
+        self.assertEqual(len(po.system.entities), 1)
+        self.assertEqual(po.system.entities[0].pmi_offset, 2)
+        fh = StringIO()
+        w = ihm.format.CifWriter(fh)
+        self.assign_entity_asym_ids(po.system)
+        self.assign_range_ids(po.system)
+        d = ihm.dumper._PolySeqSchemeDumper()
+        d.dump(po.system, w)
+        out = fh.getvalue()
+        self.assertEqual(out, """#
+loop_
+_pdbx_poly_seq_scheme.asym_id
+_pdbx_poly_seq_scheme.entity_id
+_pdbx_poly_seq_scheme.seq_id
+_pdbx_poly_seq_scheme.mon_id
+_pdbx_poly_seq_scheme.pdb_seq_num
+_pdbx_poly_seq_scheme.auth_seq_num
+_pdbx_poly_seq_scheme.pdb_mon_id
+_pdbx_poly_seq_scheme.auth_mon_id
+_pdbx_poly_seq_scheme.pdb_strand_id
+_pdbx_poly_seq_scheme.pdb_ins_code
+A 1 1 TYR 3 3 TYR TYR A .
+A 1 2 ALA 4 4 ALA ALA A .
+A 1 3 CYS 5 5 CYS CYS A .
+A 1 4 MET 6 6 MET MET A .
+A 1 5 GLU 7 7 GLU GLU A .
+B 1 1 TYR 3 3 TYR TYR B .
+B 1 2 ALA 4 4 ALA ALA B .
+B 1 3 CYS 5 5 CYS CYS B .
+B 1 4 MET 6 6 MET MET B .
+B 1 5 GLU 7 7 GLU GLU B .
+#
+""")
+
+
 if __name__ == '__main__':
     IMP.test.main()