ENH: Speed up reading of small buffers

larsoner · Jan 9, 2024 · 6999bb0
1 parent 4750f0d
commit 6999bb0
Show file tree

Hide file tree

Showing 7 changed files with 133 additions and 197 deletions.
diff --git a/doc/changes/devel/newfeature.rst b/doc/changes/devel/newfeature.rst
@@ -0,0 +1 @@
+Speed up raw FIF reading when using small buffer sizes by `Eric Larson`_.
diff --git a/mne/_fiff/open.py b/mne/_fiff/open.py
@@ -13,7 +13,13 @@
 
 from ..utils import _file_like, logger, verbose, warn
 from .constants import FIFF
-from .tag import Tag, _call_dict_names, _matrix_info, read_tag, read_tag_info
+from .tag import (
+    Tag,
+    _call_dict_names,
+    _matrix_info,
+    _read_tag_header,
+    read_tag,
+)
 from .tree import dir_tree_find, make_dir_tree
 
 
@@ -131,6 +137,7 @@ def fiff_open(fname, preload=False, verbose=None):
         raise
 
 
+# @profile
 def _fiff_open(fname, fid, preload):
     # do preloading of entire file
     if preload:
@@ -139,7 +146,7 @@ def _fiff_open(fname, fid, preload):
         with fid as fid_old:
             fid = BytesIO(fid_old.read())
 
-    tag = read_tag_info(fid)
+    tag = _read_tag_header(fid, 0)
 
     #   Check that this looks like a fif file
     prefix = f"file {repr(fname)} does not"
@@ -152,7 +159,7 @@ def _fiff_open(fname, fid, preload):
     if tag.size != 20:
         raise ValueError(f"{prefix} start with a file id tag")
 
-    tag = read_tag(fid)
+    tag = read_tag(fid, tag.next_pos)
 
     if tag.kind != FIFF.FIFF_DIR_POINTER:
         raise ValueError(f"{prefix} have a directory pointer")
@@ -176,16 +183,15 @@ def _fiff_open(fname, fid, preload):
             directory = dir_tag.data
             read_slow = False
     if read_slow:
-        fid.seek(0, 0)
+        pos = 0
+        fid.seek(pos, 0)
         directory = list()
-        while tag.next >= 0:
-            pos = fid.tell()
-            tag = read_tag_info(fid)
+        while pos is not None:
+            tag = _read_tag_header(fid, pos)
             if tag is None:
                 break  # HACK : to fix file ending with empty tag...
-            else:
-                tag.pos = pos
-                directory.append(tag)
+            pos = tag.next_pos
+            directory.append(tag)
 
     tree, _ = make_dir_tree(fid, directory)
 
@@ -309,7 +315,7 @@ def _show_tree(
         for k, kn, size, pos, type_ in zip(kinds[:-1], kinds[1:], sizes, poss, types):
             if not tag_found and k != tag_id:
                 continue
-            tag = Tag(k, size, 0, pos)
+            tag = Tag(k, size, 0, -1, pos)
             if read_limit is None or size <= read_limit:
                 try:
                     tag = read_tag(fid, pos)

diff --git a/mne/_fiff/tag.py b/mne/_fiff/tag.py
@@ -7,7 +7,9 @@
 import html
 import re
 import struct
+from dataclasses import dataclass
 from functools import partial
+from typing import Any
 
 import numpy as np
 from scipy.sparse import csc_matrix, csr_matrix
@@ -28,40 +30,16 @@
 # HELPERS
 
 
+@dataclass
 class Tag:
-    """Tag in FIF tree structure.
+    """Tag in FIF tree structure."""
 
-    Parameters
-    ----------
-    kind : int
-        Kind of Tag.
-    type_ : int
-        Type of Tag.
-    size : int
-        Size in bytes.
-    int : next
-        Position of next Tag.
-    pos : int
-        Position of Tag is the original file.
-    """
-
-    def __init__(self, kind, type_, size, next, pos=None):
-        self.kind = int(kind)
-        self.type = int(type_)
-        self.size = int(size)
-        self.next = int(next)
-        self.pos = pos if pos is not None else next
-        self.pos = int(self.pos)
-        self.data = None
-
-    def __repr__(self):  # noqa: D105
-        attrs = list()
-        for attr in ("kind", "type", "size", "next", "pos", "data"):
-            try:
-                attrs.append(f"{attr} {getattr(self, attr)}")
-            except AttributeError:
-                pass
-        return "<Tag | " + " - ".join(attrs) + ">"
+    kind: int
+    type: int
+    size: int
+    next: int
+    pos: int
+    data: Any = None
 
     def __eq__(self, tag):  # noqa: D105
         return int(
@@ -73,17 +51,15 @@ def __eq__(self, tag):  # noqa: D105
             and self.data == tag.data
         )
 
-
-def read_tag_info(fid):
-    """Read Tag info (or header)."""
-    tag = _read_tag_header(fid)
-    if tag is None:
-        return None
-    if tag.next == 0:
-        fid.seek(tag.size, 1)
-    elif tag.next > 0:
-        fid.seek(tag.next, 0)
-    return tag
+    @property
+    def next_pos(self):
+        """The next tag position."""
+        if self.next == FIFF.FIFFV_NEXT_SEQ:  # 0
+            return self.pos + 16 + self.size
+        elif self.next > 0:
+            return self.next
+        else:  # self.next should be -1 if we get here
+            return None  # safest to return None so that things like fid.seek die
 
 
 def _frombuffer_rows(fid, tag_size, dtype=None, shape=None, rlims=None):
@@ -157,16 +133,18 @@ def _loc_to_eeg_loc(loc):
 # by the function names.
 
 
-def _read_tag_header(fid):
+def _read_tag_header(fid, pos):
     """Read only the header of a Tag."""
-    s = fid.read(4 * 4)
+    fid.seek(pos, 0)
+    s = fid.read(16)
     if len(s) != 16:
         where = fid.tell() - len(s)
         extra = f" in file {fid.name}" if hasattr(fid, "name") else ""
         warn(f"Invalid tag with only {len(s)}/16 bytes at position {where}{extra}")
         return None
     # struct.unpack faster than np.frombuffer, saves ~10% of time some places
-    return Tag(*struct.unpack(">iIii", s))
+    kind, type_, size, next_ = struct.unpack(">iIii", s)
+    return Tag(kind, type_, size, next_, pos)
 
 
 def _read_matrix(fid, tag, shape, rlims):
@@ -178,10 +156,10 @@ def _read_matrix(fid, tag, shape, rlims):
 
     matrix_coding, matrix_type, bit, dtype = _matrix_info(tag)
 
+    pos = tag.pos + 16
+    fid.seek(pos + tag.size - 4, 0)
     if matrix_coding == "dense":
         # Find dimensions and return to the beginning of tag data
-        pos = fid.tell()
-        fid.seek(tag.size - 4, 1)
         ndim = int(np.frombuffer(fid.read(4), dtype=">i4").item())
         fid.seek(-(ndim + 1) * 4, 1)
         dims = np.frombuffer(fid.read(4 * ndim), dtype=">i4")[::-1]
@@ -205,8 +183,6 @@ def _read_matrix(fid, tag, shape, rlims):
         data.shape = dims
     else:
         # Find dimensions and return to the beginning of tag data
-        pos = fid.tell()
-        fid.seek(tag.size - 4, 1)
         ndim = int(np.frombuffer(fid.read(4), dtype=">i4").item())
         fid.seek(-(ndim + 2) * 4, 1)
         dims = np.frombuffer(fid.read(4 * (ndim + 1)), dtype=">i4")
@@ -388,7 +364,16 @@ def _read_old_pack(fid, tag, shape, rlims):
 
 def _read_dir_entry_struct(fid, tag, shape, rlims):
     """Read dir entry struct tag."""
-    return [_read_tag_header(fid) for _ in range(tag.size // 16 - 1)]
+    pos = tag.pos + 16
+    entries = list()
+    for offset in range(1, tag.size // 16):
+        ent = _read_tag_header(fid, pos + offset * 16)
+        # The position of the real tag on disk is stored in the "next" entry within the
+        # directory, so we need to overwrite ent.pos. For safety let's also overwrite
+        # ent.next to point nowhere
+        ent.pos, ent.next = ent.next, -1
+        entries.append(ent)
+    return entries
 
 
 def _read_julian(fid, tag, shape, rlims):
@@ -439,7 +424,7 @@ def _read_julian(fid, tag, shape, rlims):
     _call_dict_names[key] = dtype
 
 
-def read_tag(fid, pos=None, shape=None, rlims=None):
+def read_tag(fid, pos, shape=None, rlims=None, *, advance=True):
     """Read a Tag from a file at a given position.
 
     Parameters
@@ -456,15 +441,17 @@ def read_tag(fid, pos=None, shape=None, rlims=None):
         Note that data are assumed to be stored row-major in the file. Only to
         be used with data stored as a vector (not implemented for matrices
         yet).
+    advance : bool
+        If True (default), advance to next tag position after reading.
+
+        .. versionadded:: 1.7
 
     Returns
     -------
     tag : Tag
         The Tag read.
     """
-    if pos is not None:
-        fid.seek(pos, 0)
-    tag = _read_tag_header(fid)
+    tag = _read_tag_header(fid, pos)
     if tag is None:
         return tag
     if tag.size > 0:
@@ -477,10 +464,6 @@ def read_tag(fid, pos=None, shape=None, rlims=None):
             except KeyError:
                 raise Exception(f"Unimplemented tag data type {tag.type}") from None
             tag.data = fun(fid, tag, shape, rlims)
-    if tag.next != FIFF.FIFFV_NEXT_SEQ:
-        # f.seek(tag.next,0)
-        fid.seek(tag.next, 1)  # XXX : fix? pb when tag.next < 0
-
     return tag
 
 

diff --git a/mne/_fiff/tree.py b/mne/_fiff/tree.py
@@ -4,12 +4,10 @@
 # License: BSD-3-Clause
 # Copyright the MNE-Python contributors.
 
-import numpy as np
 
 from ..utils import logger, verbose
 from .constants import FIFF
-from .tag import Tag, read_tag
-from .write import _write, end_block, start_block, write_id
+from .tag import read_tag
 
 
 def dir_tree_find(tree, kind):
@@ -108,46 +106,3 @@ def make_dir_tree(fid, directory, start=0, indent=0, verbose=None):
     logger.debug("    " * indent + "end } %d" % block)
     last = this
     return tree, last
-
-
-###############################################################################
-# Writing
-
-
-def copy_tree(fidin, in_id, nodes, fidout):
-    """Copy directory subtrees from fidin to fidout."""
-    if len(nodes) <= 0:
-        return
-
-    if not isinstance(nodes, list):
-        nodes = [nodes]
-
-    for node in nodes:
-        start_block(fidout, node["block"])
-        if node["id"] is not None:
-            if in_id is not None:
-                write_id(fidout, FIFF.FIFF_PARENT_FILE_ID, in_id)
-
-            write_id(fidout, FIFF.FIFF_BLOCK_ID, in_id)
-            write_id(fidout, FIFF.FIFF_PARENT_BLOCK_ID, node["id"])
-
-        if node["directory"] is not None:
-            for d in node["directory"]:
-                #   Do not copy these tags
-                if (
-                    d.kind == FIFF.FIFF_BLOCK_ID
-                    or d.kind == FIFF.FIFF_PARENT_BLOCK_ID
-                    or d.kind == FIFF.FIFF_PARENT_FILE_ID
-                ):
-                    continue
-
-                #   Read and write tags, pass data through transparently
-                fidin.seek(d.pos, 0)
-                tag = Tag(*np.fromfile(fidin, (">i4,>I4,>i4,>i4"), 1)[0])
-                tag.data = np.fromfile(fidin, ">B", tag.size)
-                _write(fidout, tag.data, tag.kind, 1, tag.type, ">B")
-
-        for child in node["children"]:
-            copy_tree(fidin, in_id, child, fidout)
-
-        end_block(fidout, node["block"])
diff --git a/mne/epochs.py b/mne/epochs.py
@@ -39,7 +39,7 @@
     pick_info,
 )
 from ._fiff.proj import ProjMixin, setup_proj
-from ._fiff.tag import read_tag, read_tag_info
+from ._fiff.tag import _read_tag_header, read_tag
 from ._fiff.tree import dir_tree_find
 from ._fiff.utils import _make_split_fnames
 from ._fiff.write import (
@@ -3779,8 +3779,7 @@ def _read_one_epoch_file(f, tree, preload):
             elif kind == FIFF.FIFF_EPOCH:
                 # delay reading until later
                 fid.seek(pos, 0)
-                data_tag = read_tag_info(fid)
-                data_tag.pos = pos
+                data_tag = _read_tag_header(fid, pos)
                 data_tag.type = data_tag.type ^ (1 << 30)
             elif kind in [FIFF.FIFF_MNE_BASELINE_MIN, 304]:
                 # Constant 304 was used before v0.11