Skip to content

Commit

Permalink
v0.6.3 numpy2 scipy1.14 fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
asistradition committed Jul 8, 2024
1 parent 17bf404 commit eb7bbb1
Show file tree
Hide file tree
Showing 14 changed files with 207 additions and 99 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ['3.8', '3.9', '3.10']
python-version: ['3.9', '3.10', '3.11']

steps:
- uses: actions/checkout@v3
Expand Down
9 changes: 5 additions & 4 deletions inferelator/preprocessing/data_normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from inferelator.utils.debug import Debug
from inferelator.utils.data import convert_array_to_float
from inferelator.utils.sparse import todense


class TruncRobustScaler(RobustScaler):
Expand Down Expand Up @@ -303,7 +304,7 @@ def scale_array(
:type magnitude_limit: numeric, optional
"""

if sparse.isspmatrix(array):
if sparse.issparse(array):
out = np.empty(
shape=array.shape,
dtype=float
Expand Down Expand Up @@ -340,8 +341,8 @@ def scale_vector(
"""

# Convert a sparse vector to a dense vector
if sparse.isspmatrix(vec):
vec = vec.A.ravel()
if sparse.issparse(vec):
vec = todense(vec).ravel()

# Return 0s if the variance is 0
if np.var(vec) == 0:
Expand All @@ -358,7 +359,7 @@ def scale_vector(

def _magnitude_limit(x, lim):

ref = x.data if sparse.isspmatrix(x) else x
ref = x.data if sparse.issparse(x) else x

np.minimum(ref, lim, out=ref)
np.maximum(ref, -1 * lim, out=ref)
Expand Down
9 changes: 5 additions & 4 deletions inferelator/preprocessing/velocity.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
Debug
)
from inferelator.utils import Validator as check
from inferelator.utils.sparse import todense


def extract_transcriptional_output(
Expand Down Expand Up @@ -290,9 +291,9 @@ def _sparse_safe_multiply(x, y):
:rtype: np.ndarray, sp.spmatrix
"""

if sparse.isspmatrix(x):
if sparse.issparse(x):
return x.multiply(y).tocsr()
elif sparse.isspmatrix(y):
elif sparse.issparse(y):
return y.multiply(x).tocsr()
else:
return np.multiply(x, y)
Expand All @@ -310,7 +311,7 @@ def _sparse_safe_add(x, y):
:rtype: np.ndarray
"""

if sparse.isspmatrix(x) or sparse.isspmatrix(y):
return (x + y).A
if sparse.issparse(x) or sparse.issparse(y):
return todense(x + y).A
else:
return np.add(x, y)
147 changes: 104 additions & 43 deletions inferelator/regression/mi.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
from inferelator.utils import (
Debug,
array_set_diag,
safe_apply_to_array
safe_apply_to_array,
todense
)
from inferelator.utils import Validator as check

Expand All @@ -17,22 +18,44 @@
# DDOF for CLR
CLR_DDOF = 1

# Log type for MI calculations. np.log2 gives results in bits; np.log gives results in nats
# Log type for MI calculations
# np.log2 gives results in bits; np.log gives results in nats
DEFAULT_LOG_TYPE = np.log


class MIDriver:

@staticmethod
def run(x, y, bins=DEFAULT_NUM_BINS, logtype=DEFAULT_LOG_TYPE, return_mi=True):
return context_likelihood_mi(x, y, bins=bins, logtype=logtype, return_mi=return_mi)
def run(
x,
y,
bins=DEFAULT_NUM_BINS,
logtype=DEFAULT_LOG_TYPE,
return_mi=True
):
return context_likelihood_mi(
x,
y,
bins=bins,
logtype=logtype,
return_mi=return_mi
)


def context_likelihood_mi(x, y, bins=DEFAULT_NUM_BINS, logtype=DEFAULT_LOG_TYPE, return_mi=True):
def context_likelihood_mi(
x,
y,
bins=DEFAULT_NUM_BINS,
logtype=DEFAULT_LOG_TYPE,
return_mi=True
):
"""
Wrapper to calculate the Context Likelihood of Relatedness and Mutual Information for two data sets that have
common condition rows. The y argument will be used to calculate background MI for the x & y MI.
As an implementation detail, y will be cast to a dense array if it is sparse.
Wrapper to calculate the Context Likelihood of Relatedness and
Mutual Information for two data sets that have
common condition rows. The y argument will be used to calculate
background MI for the x & y MI.
As an implementation detail, y will be cast to a dense array if
it is sparse.
X can be sparse with no internal copy.
This function handles unpacking and packing the InferelatorData.
Expand All @@ -41,13 +64,15 @@ def context_likelihood_mi(x, y, bins=DEFAULT_NUM_BINS, logtype=DEFAULT_LOG_TYPE,
:type x: InferelatorData [N x G]
:param y: An N x K InferelatorData object
:type y: InferelatorData [N x K]
:param logtype: The logarithm function to use when calculating information. Defaults to natural log (np.log)
:param logtype: The logarithm function to use when calculating information.
Defaults to natural log (np.log)
:type logtype: np.log func
:param bins: Number of bins for discretizing continuous variables
:type bins: int
:param return_mi: Boolean for returning a MI object. Defaults to True
:type return_mi: bool
:return clr, mi: CLR and MI InferelatorData objects. Returns (CLR, None) if return_mi is False.
:return clr, mi: CLR and MI InferelatorData objects.
Returns (CLR, None) if return_mi is False.
:rtype InferelatorData, InferelatorData:
"""

Expand All @@ -61,11 +86,21 @@ def context_likelihood_mi(x, y, bins=DEFAULT_NUM_BINS, logtype=DEFAULT_LOG_TYPE,
mi_c = y.gene_names

# Build a [G x K] mutual information array
mi = mutual_information(x.expression_data, y.expression_data, bins, logtype=logtype)
mi = mutual_information(
x.expression_data,
y.expression_data,
bins,
logtype=logtype
)
array_set_diag(mi, 0., mi_r, mi_c)

# Build a [K x K] mutual information array
mi_bg = mutual_information(y.expression_data, y.expression_data, bins, logtype=logtype)
mi_bg = mutual_information(
y.expression_data,
y.expression_data,
bins,
logtype=logtype
)
array_set_diag(mi_bg, 0., mi_c, mi_c)

# Calculate CLR
Expand All @@ -79,24 +114,27 @@ def context_likelihood_mi(x, y, bins=DEFAULT_NUM_BINS, logtype=DEFAULT_LOG_TYPE,

def mutual_information(x, y, bins, logtype=DEFAULT_LOG_TYPE):
"""
Calculate the mutual information matrix between two data matrices, where the columns are equivalent conditions
Calculate the mutual information matrix between two data matrices,
where the columns are equivalent conditions
:param x: np.array (n x m1)
The data from m1 variables across n conditions
:param y: np.array (n x m2)
The data from m2 variables across n conditions
:param bins: int
Number of bins to discretize continuous data into for the generation of a contingency table
Number of bins to discretize continuous data into for the generation
of a contingency table
:param logtype: np.log func
Which type of log function should be used (log2 results in MI bits, log results in MI nats, log10... is weird)
Which type of log function should be used (log2 results in MI bits,
log results in MI nats, log10... is weird)
:return mi: pd.DataFrame (m1 x m2)
The mutual information between variables m1 and m2
"""

# Discretize the input matrix y
y = _make_array_discrete(
y.A if sps.isspmatrix(y) else y,
todense(y),
bins,
axis=0
)
Expand All @@ -120,31 +158,30 @@ def mutual_information(x, y, bins, logtype=DEFAULT_LOG_TYPE):

return mi


def _mi_wrapper(x, Y, i, bins, logtype, m1):

Debug.vprint(
f"Mutual Information Calculation [{i} / {m1}]",
level=2 if i % 1000 == 0 else 3
)

# Turn off runtime warnings (there is an explicit check for NaNs and INFs in-function)
with np.errstate(divide='ignore', invalid='ignore'):
discrete_X = _make_discrete(
todense(x).ravel(),
bins
)

discrete_X = _make_discrete(
x.A.ravel() if sps.isspmatrix(x) else x.ravel(),
bins
return [
_calc_mi(
_make_table(
discrete_X, Y[:, j],
bins
),
logtype=logtype
)
for j in range(Y.shape[1])
]

return [
_calc_mi(
_make_table(
discrete_X, Y[:, j],
bins
),
logtype=logtype)
for j in range(Y.shape[1]
)
]

def _x_generator(X):

Expand All @@ -154,7 +191,8 @@ def _x_generator(X):

def calc_mixed_clr(mi, mi_bg):
"""
Calculate the context likelihood of relatedness from mutual information
and the background mutual information
:param mi: Mutual information array [m1 x m2]
:type mi: np.ndarray
Expand All @@ -167,12 +205,16 @@ def calc_mixed_clr(mi, mi_bg):
with np.errstate(invalid='ignore'):

# Calculate the zscore for the dynamic CLR
z_dyn = np.round(mi, 10) # Rounding so that float precision differences don't turn into huge CLR differences
# Rounding so that float precision differences don't turn
# into huge CLR differences
z_dyn = np.round(mi, 10)
z_dyn = np.subtract(z_dyn, np.mean(mi, axis=0))
z_dyn = np.divide(z_dyn, np.std(mi, axis=0, ddof=CLR_DDOF))

# Calculate the zscore for the static CLR
z_stat = np.round(mi, 10) # Rounding so that float precision differences don't turn into huge CLR differences
# Rounding so that float precision differences don't turn
# into huge CLR differences
z_stat = np.round(mi, 10)
z_stat = np.subtract(z_stat, np.mean(mi_bg, axis=0))
z_stat = np.divide(z_stat, np.std(mi_bg, axis=0, ddof=CLR_DDOF))

Expand Down Expand Up @@ -222,15 +264,17 @@ def _make_discrete(arr_vec, num_bins):
# Continuous values to discrete bins [0, num_bins)
# Write directly into a np.int16 array with the standard unsafe conversion
return np.floor(
(arr_vec - arr_min) / (arr_max - arr_min + np.spacing(arr_max - arr_min)) * num_bins,
(arr_vec - arr_min) /
(arr_max - arr_min + np.spacing(arr_max - arr_min)) * num_bins,
out=np.zeros(shape=arr_vec.shape, dtype=np.int16),
casting='unsafe'
)


def _make_table(x, y, num_bins):
"""
Takes two variable vectors which have been made into discrete integer bins and constructs a contingency table
Takes two variable vectors which have been made into discrete integer
bins and constructs a contingency table
:param x: np.ndarray
1d array of discrete data
:param y: np.ndarray
Expand All @@ -242,7 +286,8 @@ def _make_table(x, y, num_bins):
"""

# The only fast way to do this is by reindexing the table as an index array
# Then piling everything up with bincount and reshaping it back into the table
# Then piling everything up with bincount and reshaping it back into the
# table
return np.bincount(
x * num_bins + y,
minlength=num_bins ** 2
Expand All @@ -264,24 +309,40 @@ def _calc_mi(table, logtype=DEFAULT_LOG_TYPE):
Mutual information between variable x & y
"""

# [n]
total = np.sum(table)

# (PxPy) [n x n]
mi_val = np.dot((np.sum(table, axis=1) / total).reshape(-1, 1),
(np.sum(table, axis=0) / total).reshape(1, -1))
mi_val = np.dot(
(np.sum(table, axis=1) / total).reshape(-1, 1),
(np.sum(table, axis=0) / total).reshape(1, -1)
)

# (Pxy) [n x n]
table = np.divide(table, total)
table = np.divide(
table,
total,
out=np.zeros(table.shape, float),
where=total != 0
)

# (Pxy)/(PxPy) [n x n]
mi_val = np.divide(table, mi_val)
mi_val = np.divide(
table,
mi_val,
out=np.zeros(table.shape, float),
where=mi_val != 0
)

# log[(Pxy)/(PxPy)] [n x n]
mi_val = logtype(mi_val)
mi_val = logtype(
mi_val,
out=mi_val,
where=mi_val != 0
)

# Pxy(log[(Pxy)/(PxPy)]) [n x n]
mi_val = np.multiply(table, mi_val)
mi_val[np.isnan(mi_val)] = 0

# Summation
return np.sum(mi_val)
Loading

0 comments on commit eb7bbb1

Please sign in to comment.