Merge pull request #28 from PixelgenTechnologies/feature/exe-1184-bug-with-index-column

Fixes to pixeldataset aggregation and parquet file indexing
johandahlberg authored Oct 23, 2023
2 parents e9f544e + 563f14d commit eb15ed2
Showing 3 changed files with 143 additions and 3 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,15 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [UNRELEASED] - ...

### Fixed

* Fixed broken pixeldataset aggregation for more than two samples.
* Fixed a bug in graph generation caused by accidentally writing the index to the parquet file.
For backwards compatibility, if there is a column named `index` in the edgelist, it
will be removed and the user will get a warning indicating that this has happened
(see the sketch below).
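
A minimal sketch of the compatibility rule described in the entry above. The frame and column
values here are made up; the real handling lives in the `edgelist_lazy` property further down
in this diff:

```python
# Sketch: drop a stray `index` column from an edgelist lazy frame and warn,
# mirroring the backwards-compatibility rule described in the changelog entry.
import warnings

import polars as pl

lz_edgelist = pl.DataFrame({"upia": ["A"], "upib": ["B"], "index": [0]}).lazy()

if "index" in lz_edgelist.columns:
    warnings.warn(
        "A column called `index` was identified in your edgelist. "
        "This will be removed."
    )
    lz_edgelist = lz_edgelist.drop("index")

assert "index" not in lz_edgelist.columns
```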

## [0.15.1] - 2023-10-18

### Fixed
19 changes: 16 additions & 3 deletions src/pixelator/pixeldataset.py
Expand Up @@ -85,7 +85,7 @@ def _concatenate_edgelists(datasets, sample_names):
)
concatenated = concatenated.collect()

for idx, subsequent in enumerate(datasets[:1], start=1):
for idx, subsequent in enumerate(datasets[1:], start=1):
concatenated = concatenated.extend(
subsequent.edgelist_lazy.collect().with_columns(
sample=pl.lit(sample_names[idx], dtype=pl.Categorical)
@@ -97,6 +97,7 @@ def _concatenate_edgelists(datasets, sample_names):
pl.col("component"), pl.col("sample"), separator="_"
)
)

return concatenated
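
The one-character fix above (`datasets[:1]` → `datasets[1:]`) is easiest to see with plain
list slicing; a small sketch with placeholder values rather than real edgelists:

```python
# Sketch: `datasets[:1]` re-yields only the first dataset, so every dataset
# after the second was silently dropped from the concatenated edgelist;
# `datasets[1:]` pairs each remaining dataset with its sample name.
datasets = ["edgelist_1", "edgelist_2", "edgelist_3"]
sample_names = ["sample1", "sample2", "sample3"]

old = list(enumerate(datasets[:1], start=1))  # [(1, 'edgelist_1')] -- samples 3+ lost
new = list(enumerate(datasets[1:], start=1))  # [(1, 'edgelist_2'), (2, 'edgelist_3')]

assert [sample_names[idx] for idx, _ in new] == ["sample2", "sample3"]
```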


@@ -137,6 +138,8 @@ def simple_aggregate(
raise AssertionError(
"There must be as many sample names provided as there are dataset"
)
if not len(set(sample_names)) == len(sample_names):
raise AssertionError("All provided sample names must be unique")

all_var_identical = all(
map(
@@ -168,6 +171,7 @@ def _add_sample_name_as_obs_col(adata, name):
edgelists = _enforce_edgelist_types(
_concatenate_edgelists(datasets, sample_names).to_pandas()
)

polarizations = pd.concat(
_get_attr_and_index_by_component(
"polarization", datasets=datasets, sample_names=sample_names
@@ -501,7 +505,9 @@ class PixelFileParquetFormatSpec(PixelFileFormatSpec):
@staticmethod
def serialize_dataframe(dataframe: pd.DataFrame, path: PathType) -> None:
"""Serialize a dataframe from the give path."""
dataframe.to_parquet(path, engine="fastparquet", compression="zstd")
dataframe.to_parquet(
path, engine="fastparquet", compression="zstd", index=False
)

@staticmethod
def deserialize_dataframe(path: PathType, key: str) -> pd.DataFrame:
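
A sketch of why `index=False` matters in `serialize_dataframe` above, assuming (as the
compatibility code elsewhere in this diff does) that a pandas index written by fastparquet
surfaces as an extra `index` column when the file is read back; file names are illustrative:

```python
# Sketch: with the default index=True, fastparquet persists the DataFrame
# index, which polars then reports as an ordinary column; index=False keeps
# the parquet file limited to the real columns.
import pandas as pd
import polars as pl

df = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3]}, index=["X", "Y", "Z"])

df.to_parquet("with_index.parquet", engine="fastparquet")
df.to_parquet("without_index.parquet", engine="fastparquet", index=False)

print(pl.read_parquet("with_index.parquet").columns)     # includes an extra index column
print(pl.read_parquet("without_index.parquet").columns)  # ['A', 'B']
```
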
@@ -640,7 +646,14 @@ def edgelist(self, value: pd.DataFrame) -> None:
@property
def edgelist_lazy(self) -> pl.LazyFrame:
"""Get the edge list as a lazy dataframe."""
return self._backend.edgelist_lazy
lz_edgelist = self._backend.edgelist_lazy
if "index" in lz_edgelist.columns:
warnings.warn(
"A column called `index` was identified in your edgelist. "
"This will be removed."
)
lz_edgelist = lz_edgelist.drop("index")
return lz_edgelist

@property
def polarization(self) -> Optional[pd.DataFrame]:
118 changes: 118 additions & 0 deletions tests/test_pixeldataset.py
@@ -13,6 +13,7 @@

import numpy as np
import pandas as pd
import polars as pl
import pytest
from anndata import AnnData
from numpy.testing import assert_array_equal
@@ -70,6 +71,22 @@ def test_pixel_file_parquet_format_spec_can_save(setup_basic_pixel_dataset, tmp_
assert file_target.is_file()


def test_pixel_file_parquet_no_index_in_parquet_files(tmp_path):
# Checking EXE-1184
# We do not want an index added to the parquet file, regardless of whether
# it is read with pandas or polars
file_target = tmp_path / "df.parquet"
df = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3]}, index=["X", "Y", "Z"])
PixelFileParquetFormatSpec().serialize_dataframe(df, path=file_target)
assert file_target.is_file()
res1 = pd.read_parquet(file_target)
assert set(res1.columns) == {"A", "B"}
res2 = pl.scan_parquet(file_target)
assert set(res2.columns) == {"A", "B"}
res3 = pl.read_parquet(file_target)
assert set(res3.columns) == {"A", "B"}


def test_pixel_file_csv_format_spec_can_save(setup_basic_pixel_dataset, tmp_path):
"""test_pixel_file_csv_format_spec_can_save."""
dataset, *_ = setup_basic_pixel_dataset
@@ -715,6 +732,107 @@ def test_filter_should_return_proper_typed_edgelist_data(setup_basic_pixel_datas
result.graph(result.adata.obs.index[0])


def test_on_aggregation_not_passing_unique_sample_names_should_raise(
tmp_path,
setup_basic_pixel_dataset,
):
dataset_1, *_ = setup_basic_pixel_dataset
dataset_2 = dataset_1.copy()

file_target_1 = tmp_path / "dataset_1.pxl"
dataset_1.save(str(file_target_1))
file_target_2 = tmp_path / "dataset_2.pxl"
dataset_2.save(str(file_target_2))

with pytest.raises(AssertionError):
simple_aggregate(
sample_names=["sample1", "sample1"],
datasets=[
read(file_target_1),
read(file_target_2),
],
)


def test_aggregation_all_samples_show_up(
tmp_path,
setup_basic_pixel_dataset,
):
dataset_1, *_ = setup_basic_pixel_dataset
dataset_2 = dataset_1.copy()
dataset_3 = dataset_1.copy()
dataset_4 = dataset_1.copy()

file_target_1 = tmp_path / "dataset_1.pxl"
dataset_1.save(str(file_target_1))
file_target_2 = tmp_path / "dataset_2.pxl"
dataset_2.save(str(file_target_2))
file_target_3 = tmp_path / "dataset_3.pxl"
dataset_3.save(str(file_target_3))
file_target_4 = tmp_path / "dataset_4.pxl"
dataset_4.save(str(file_target_4))

result = simple_aggregate(
sample_names=["sample1", "sample2", "sample3", "sample4"],
datasets=[
read(file_target_1),
read(file_target_2),
read(file_target_3),
read(file_target_4),
],
)
assert set(result.edgelist["sample"].unique()) == {
"sample1",
"sample2",
"sample3",
"sample4",
}
assert set(result.polarization["sample"].unique()) == {
"sample1",
"sample2",
"sample3",
"sample4",
}
assert set(result.colocalization["sample"].unique()) == {
"sample1",
"sample2",
"sample3",
"sample4",
}
assert set(result.adata.obs["sample"].unique()) == {
"sample1",
"sample2",
"sample3",
"sample4",
}


def test_lazy_edgelist_should_warn_and_rm_on_index_column(setup_basic_pixel_dataset):
# Checking EXE-1184
# Pixelator 0.13.0-0.15.2 stored an index column in the parquet files
# which showed up as a column when reading the lazy frames. This
# caused graph building to fail when working on concatenated
# pixeldatasets, since the column would be propagated to the
# edgelist, which broke the igraph Graph construction because it
# assumes that the first two columns are the vertices (see the
# sketch after this test)
dataset, *_ = setup_basic_pixel_dataset
dataset.edgelist["index"] = pd.Series(range(len(dataset.edgelist)))

with pytest.warns(UserWarning):
result = dataset.edgelist_lazy
assert set(result.columns) == {
"sequence",
"upib",
"upia",
"upi_unique_count",
"umi",
"umi_unique_count",
"component",
"count",
"marker",
}
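
The failure mode noted in the comment above can be sketched without igraph: any consumer
that treats the first two edgelist columns as the edge endpoints picks the wrong pair once
the persisted index sneaks in. The frame below is illustrative, not the fixture data:

```python
# Sketch: code that assumes the first two edgelist columns are the edge
# endpoints gets the wrong pair once a stray `index` column is prepended.
import pandas as pd

edgelist = pd.DataFrame({"upia": ["A", "B"], "upib": ["B", "C"], "count": [1, 2]})
assert tuple(edgelist.columns[:2]) == ("upia", "upib")   # intended endpoints

edgelist.insert(0, "index", list(range(len(edgelist))))  # the accidentally persisted index
assert tuple(edgelist.columns[:2]) == ("index", "upia")  # endpoints are now wrong
```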


def test_copy(setup_basic_pixel_dataset):
"""test_copy."""
dataset_1, *_ = setup_basic_pixel_dataset
