Merge pull request #28 from PixelgenTechnologies/feature/exe-1184-bug-with-index-column

Fixes to pixeldataset aggregation and parquet file indexing
johandahlberg authored Oct 23, 2023
2 parents e9f544e + 563f14d commit eb15ed2
Showing 3 changed files with 143 additions and 3 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,15 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [UNRELEASED] - ...

### Fixed

* Fixed broken pixeldataset aggregation for more than two samples.
* Fixed a bug in graph generation caused by accidentally writing the index to the parquet file.
For backwards compatibility, if there is a column named `index` in the edgelist, it
will be removed and the user will get a warning indicating that this has happened
(see the sketch below).
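
A minimal sketch of the compatibility rule described in the entry above. The frame and column
values here are made up; the real handling lives in the `edgelist_lazy` property further down
in this diff:

```python
# Sketch: drop a stray `index` column from an edgelist lazy frame and warn,
# mirroring the backwards-compatibility rule described in the changelog entry.
import warnings

import polars as pl

lz_edgelist = pl.DataFrame({"upia": ["A"], "upib": ["B"], "index": [0]}).lazy()

if "index" in lz_edgelist.columns:
    warnings.warn(
        "A column called `index` was identified in your edgelist. "
        "This will be removed."
    )
    lz_edgelist = lz_edgelist.drop("index")

assert "index" not in lz_edgelist.columns
```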

## [0.15.1] - 2023-10-18

### Fixed
19 changes: 16 additions & 3 deletions src/pixelator/pixeldataset.py
Expand Up @@ -85,7 +85,7 @@ def _concatenate_edgelists(datasets, sample_names):
)
concatenated = concatenated.collect()

for idx, subsequent in enumerate(datasets[:1], start=1):
for idx, subsequent in enumerate(datasets[1:], start=1):
concatenated = concatenated.extend(
subsequent.edgelist_lazy.collect().with_columns(
sample=pl.lit(sample_names[idx], dtype=pl.Categorical)
@@ -97,6 +97,7 @@ def _concatenate_edgelists(datasets, sample_names):
pl.col("component"), pl.col("sample"), separator="_"
)
)

return concatenated
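
The one-character fix above (`datasets[:1]` → `datasets[1:]`) is easiest to see with plain
list slicing; a small sketch with placeholder values rather than real edgelists:

```python
# Sketch: `datasets[:1]` re-yields only the first dataset, so every dataset
# after the second was silently dropped from the concatenated edgelist;
# `datasets[1:]` pairs each remaining dataset with its sample name.
datasets = ["edgelist_1", "edgelist_2", "edgelist_3"]
sample_names = ["sample1", "sample2", "sample3"]

old = list(enumerate(datasets[:1], start=1))  # [(1, 'edgelist_1')] -- samples 3+ lost
new = list(enumerate(datasets[1:], start=1))  # [(1, 'edgelist_2'), (2, 'edgelist_3')]

assert [sample_names[idx] for idx, _ in new] == ["sample2", "sample3"]
```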


@@ -137,6 +138,8 @@ def simple_aggregate(
raise AssertionError(
"There must be as many sample names provided as there are dataset"
)
if not len(set(sample_names)) == len(sample_names):
raise AssertionError("All provided sample names must be unique")

all_var_identical = all(
map(
@@ -168,6 +171,7 @@ def _add_sample_name_as_obs_col(adata, name):
edgelists = _enforce_edgelist_types(
_concatenate_edgelists(datasets, sample_names).to_pandas()
)

polarizations = pd.concat(
_get_attr_and_index_by_component(
"polarization", datasets=datasets, sample_names=sample_names
@@ -501,7 +505,9 @@ class PixelFileParquetFormatSpec(PixelFileFormatSpec):
@staticmethod
def serialize_dataframe(dataframe: pd.DataFrame, path: PathType) -> None:
"""Serialize a dataframe from the give path."""
dataframe.to_parquet(path, engine="fastparquet", compression="zstd")
dataframe.to_parquet(
path, engine="fastparquet", compression="zstd", index=False
)

@staticmethod
def deserialize_dataframe(path: PathType, key: str) -> pd.DataFrame:
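
A sketch of why `index=False` matters in `serialize_dataframe` above, assuming (as the
compatibility code elsewhere in this diff does) that a pandas index written by fastparquet
surfaces as an extra `index` column when the file is read back; file names are illustrative:

```python
# Sketch: with the default index=True, fastparquet persists the DataFrame
# index, which polars then reports as an ordinary column; index=False keeps
# the parquet file limited to the real columns.
import pandas as pd
import polars as pl

df = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3]}, index=["X", "Y", "Z"])

df.to_parquet("with_index.parquet", engine="fastparquet")
df.to_parquet("without_index.parquet", engine="fastparquet", index=False)

print(pl.read_parquet("with_index.parquet").columns)     # includes an extra index column
print(pl.read_parquet("without_index.parquet").columns)  # ['A', 'B']
```
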
@@ -640,7 +646,14 @@ def edgelist(self, value: pd.DataFrame) -> None:
@property
def edgelist_lazy(self) -> pl.LazyFrame:
"""Get the edge list as a lazy dataframe."""
return self._backend.edgelist_lazy
lz_edgelist = self._backend.edgelist_lazy
if "index" in lz_edgelist.columns:
warnings.warn(
"A column called `index` was identified in your edgelist. "
"This will be removed."
)
lz_edgelist = lz_edgelist.drop("index")
return lz_edgelist

@property
def polarization(self) -> Optional[pd.DataFrame]:
118 changes: 118 additions & 0 deletions tests/test_pixeldataset.py
@@ -13,6 +13,7 @@

import numpy as np
import pandas as pd
import polars as pl
import pytest
from anndata import AnnData
from numpy.testing import assert_array_equal
@@ -70,6 +71,22 @@ def test_pixel_file_parquet_format_spec_can_save(setup_basic_pixel_dataset, tmp_
assert file_target.is_file()


def test_pixel_file_parquet_no_index_in_parquet_files(tmp_path):
# Checking EXE-1184
# We do not want an index added to the parquet file, regardless of whether
# it is read with pandas or polars
file_target = tmp_path / "df.parquet"
df = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3]}, index=["X", "Y", "Z"])
PixelFileParquetFormatSpec().serialize_dataframe(df, path=file_target)
assert file_target.is_file()
res1 = pd.read_parquet(file_target)
assert set(res1.columns) == {"A", "B"}
res2 = pl.scan_parquet(file_target)
assert set(res2.columns) == {"A", "B"}
res3 = pl.read_parquet(file_target)
assert set(res3.columns) == {"A", "B"}


def test_pixel_file_csv_format_spec_can_save(setup_basic_pixel_dataset, tmp_path):
"""test_pixel_file_csv_format_spec_can_save."""
dataset, *_ = setup_basic_pixel_dataset
@@ -715,6 +732,107 @@ def test_filter_should_return_proper_typed_edgelist_data(setup_basic_pixel_datas
result.graph(result.adata.obs.index[0])


def test_on_aggregation_not_passing_unique_sample_names_should_raise(
tmp_path,
setup_basic_pixel_dataset,
):
dataset_1, *_ = setup_basic_pixel_dataset
dataset_2 = dataset_1.copy()

file_target_1 = tmp_path / "dataset_1.pxl"
dataset_1.save(str(file_target_1))
file_target_2 = tmp_path / "dataset_2.pxl"
dataset_2.save(str(file_target_2))

with pytest.raises(AssertionError):
simple_aggregate(
sample_names=["sample1", "sample1"],
datasets=[
read(file_target_1),
read(file_target_2),
],
)


def test_aggregation_all_samples_show_up(
tmp_path,
setup_basic_pixel_dataset,
):
dataset_1, *_ = setup_basic_pixel_dataset
dataset_2 = dataset_1.copy()
dataset_3 = dataset_1.copy()
dataset_4 = dataset_1.copy()

file_target_1 = tmp_path / "dataset_1.pxl"
dataset_1.save(str(file_target_1))
file_target_2 = tmp_path / "dataset_2.pxl"
dataset_2.save(str(file_target_2))
file_target_3 = tmp_path / "dataset_3.pxl"
dataset_3.save(str(file_target_3))
file_target_4 = tmp_path / "dataset_4.pxl"
dataset_4.save(str(file_target_4))

result = simple_aggregate(
sample_names=["sample1", "sample2", "sample3", "sample4"],
datasets=[
read(file_target_1),
read(file_target_2),
read(file_target_3),
read(file_target_4),
],
)
assert set(result.edgelist["sample"].unique()) == {
"sample1",
"sample2",
"sample3",
"sample4",
}
assert set(result.polarization["sample"].unique()) == {
"sample1",
"sample2",
"sample3",
"sample4",
}
assert set(result.colocalization["sample"].unique()) == {
"sample1",
"sample2",
"sample3",
"sample4",
}
assert set(result.adata.obs["sample"].unique()) == {
"sample1",
"sample2",
"sample3",
"sample4",
}


def test_lazy_edgelist_should_warn_and_rm_on_index_column(setup_basic_pixel_dataset):
# Checking EXE-1184
# Pixelator 0.13.0-0.15.2 stored an index column in the parquet files
# which showed up as a column when reading the lazy frames. This
# caused graph building to fail when working on concatenated
# pixeldatasets, since the column would be propagated to the
# edgelist, which broke the igraph Graph construction because it
# assumes that the first two columns are the vertices (see the
# sketch after this test)
dataset, *_ = setup_basic_pixel_dataset
dataset.edgelist["index"] = pd.Series(range(len(dataset.edgelist)))

with pytest.warns(UserWarning):
result = dataset.edgelist_lazy
assert set(result.columns) == {
"sequence",
"upib",
"upia",
"upi_unique_count",
"umi",
"umi_unique_count",
"component",
"count",
"marker",
}
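
The failure mode noted in the comment above can be sketched without igraph: any consumer
that treats the first two edgelist columns as the edge endpoints picks the wrong pair once
the persisted index sneaks in. The frame below is illustrative, not the fixture data:

```python
# Sketch: code that assumes the first two edgelist columns are the edge
# endpoints gets the wrong pair once a stray `index` column is prepended.
import pandas as pd

edgelist = pd.DataFrame({"upia": ["A", "B"], "upib": ["B", "C"], "count": [1, 2]})
assert tuple(edgelist.columns[:2]) == ("upia", "upib")   # intended endpoints

edgelist.insert(0, "index", list(range(len(edgelist))))  # the accidentally persisted index
assert tuple(edgelist.columns[:2]) == ("index", "upia")  # endpoints are now wrong
```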


def test_copy(setup_basic_pixel_dataset):
"""test_copy."""
dataset_1, *_ = setup_basic_pixel_dataset
