Comparing Modlyn & Scanpy feature selection methods

!pip install 'modlyn[dev]'
!lamin init --storage test-modlyn
import lamindb as ln
import modlyn as mn
import scanpy as sc
import pandas as pd
import seaborn as sns

sns.set_theme()
%config InlineBackend.figure_formats = ['svg']
ln.track()
# Configuration: switch between in-memory and Dask loader
USE_DASK = True  # set False to use in-memory path
ZARR_UID = "1xSHIdfBjfUdxKHm0000"  # example UID; change as needed
LABEL_COL = "cell_line"

# Dask runtime
DASK_DATASET_TYPE = "arrayloaders-dask"  # accepted alias (normalized internally)
BATCH_SIZE = 256
N_CHUNKS = 8
DASK_SCHEDULER = "threads"

Using a custom Dask data loader

Set USE_DASK = True and point ZARR_UID at a zarr artifact from laminlabs/arrayloader-benchmarks. The loader auto-detects whether the cached path is a single zarr store or a directory of shard stores (*.zarr) and picks the matching reader. For quick runs, we cap training with max_steps in the fit call.
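
For orientation, these are the two cached layouts the detection distinguishes (paths are illustrative, not taken from the benchmark instance):

# Single zarr store: the cached directory itself is the store
#   <cache>/.zgroup, <cache>/X/, <cache>/obs/, ...
# Directory of shards: the cached directory contains child *.zarr stores
#   <cache>/shard_0.zarr/, <cache>/shard_1.zarr/, ...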

from pathlib import Path
import lamindb as ln

if USE_DASK:
    artifact = ln.Artifact.using("laminlabs/arrayloader-benchmarks").get(ZARR_UID)
    store_path = Path(artifact.cache())
    if not store_path.is_dir():
        raise ValueError(f"ZARR_UID must cache to a directory, got: {store_path}")

    # Decide between a directory of shards (*.zarr) vs a single zarr store
    has_shards = any(child.name.endswith(".zarr") for child in store_path.iterdir())

    try:
        from arrayloaders.io.dask_loader import read_lazy_store
    except ImportError:
        read_lazy_store = None
    from arrayloaders.io import read_lazy as read_single_store

    if has_shards and read_lazy_store is not None:
        # Directory of shard stores
        adata = read_lazy_store(store_path, obs_columns=[LABEL_COL])
    else:
        # Single zarr store (or shard reader unavailable)
        adata = read_single_store(store_path, obs_columns=[LABEL_COL])
else:
    # Example H5AD path (keep your current artifact if you prefer)
    artifact = ln.Artifact.using("laminlabs/arrayloader-benchmarks").get(
        "JNaxQe8zbljesdbK0000"
    )
    adata = artifact.load()
    sc.pp.log1p(adata)

print("adata:", adata.shape)

Prepare dataset

keep = adata.obs["cell_line"].value_counts().loc[lambda x: x > 3].index
adata = adata[adata.obs["cell_line"].isin(keep)].copy()
adata
adata.obs["cell_line"].value_counts().tail()

Train LogReg with Modlyn

logreg = mn.models.SimpleLogReg(
    adata=adata,
    label_column=LABEL_COL,
    learning_rate=1e-1,
    weight_decay=1e-3,
)

fit_kwargs = {
    "adata_train": adata,
    "adata_val": None,
    "train_dataloader_kwargs": {
        "batch_size": BATCH_SIZE,
        "drop_last": False,
        "num_workers": 0,
    },
    "max_epochs": 1,
    "num_sanity_val_steps": 0,
    "log_every_n_steps": 1,
    "max_steps": 50,
}

if USE_DASK:
    fit_kwargs.update(
        {
            "dataset_type": DASK_DATASET_TYPE,
            "n_chunks": N_CHUNKS,
            "dask_scheduler": DASK_SCHEDULER,
        }
    )

logreg.fit(**fit_kwargs)

print("dataset_type:", getattr(logreg.datamodule, "dataset_type", "in-memory"))
print("train_dataset:", type(logreg.datamodule.train_dataloader().dataset).__name__)
logreg.plot_losses()
# Evaluate on a subset; materialize lazy (Dask-backed) data into memory first
adata_eval = adata[:10000]
adata_eval = adata_eval.to_memory() if hasattr(adata_eval, "to_memory") else adata_eval

if hasattr(adata_eval.X, "compute"):
    adata_eval.X = adata_eval.X.compute()

logreg.plot_classification_report(adata_eval)

Get feature scores from the different methods

df_modlyn_logreg = logreg.get_weights()
df_modlyn_logreg.head()
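
As a quick sanity check, you can read the top-weighted genes per cell line straight off this frame (assuming the layout is cell lines as rows and genes as columns, matching the scanpy frames built below):

# Top 5 genes by weight for the first cell line (layout assumed: groups x genes)
df_modlyn_logreg.iloc[0].nlargest(5)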
sc.tl.rank_genes_groups(adata, "cell_line", method="logreg", key_added="sc_logreg")
df_scanpy_logreg = sc.get.rank_genes_groups_df(
    adata, group=None, key="sc_logreg"
).pivot(index="group", columns="names", values="scores")
df_scanpy_logreg.attrs["method_name"] = "scanpy_logreg"
df_scanpy_logreg.head()
sc.tl.rank_genes_groups(adata, "cell_line", method="wilcoxon", key_added="sc_wilcoxon")
df_scanpy_wilcoxon = sc.get.rank_genes_groups_df(
    adata, group=None, key="sc_wilcoxon"
).pivot(index="group", columns="names", values="scores")
df_scanpy_wilcoxon.attrs["method_name"] = "scanpy_wilcoxon"
df_scanpy_wilcoxon.head()
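
All three score frames now share the same wide layout (one row per cell line, one column per gene), which is what the comparison below expects. A quick check on the shared gene set (a sketch, assuming identical gene naming across methods):

# How many genes the modlyn and scanpy frames have in common
shared = df_modlyn_logreg.columns.intersection(df_scanpy_wilcoxon.columns)
print(f"{len(shared)} genes shared across score frames")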

Compare feature selection results

compare = mn.eval.CompareScoresJaccard(
    [df_modlyn_logreg, df_scanpy_logreg, df_scanpy_wilcoxon], n_top_values=[5, 10, 25]
)
compare.plot_heatmaps()
compare.compute_jaccard_comparison()
compare.plot_jaccard_comparison()
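
For reference, the Jaccard index behind these plots is just overlap over union of top-n feature sets. A minimal manual version (illustrative only, not modlyn's implementation):

def jaccard(a, b):
    # |A intersect B| / |A union B| over two top-n gene sets
    a, b = set(a), set(b)
    return len(a & b) / len(a | b)


# Compare the top-10 genes for the first cell line across two methods
top_modlyn = df_modlyn_logreg.iloc[0].nlargest(10).index
top_wilcoxon = df_scanpy_wilcoxon.iloc[0].nlargest(10).index
print("jaccard(top-10):", jaccard(top_modlyn, top_wilcoxon))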
ln.finish()