sc-downstream-analysis
Single-cell RNA-seq downstream analysis pipeline: preprocessing (normalize, log-transform, HVG selection, scaling, PCA), Leiden clustering, and UMAP visualization. Outputs processed h5ad, UMAP PNG, and cluster assignment CSV.
VERSION 1.0.0
AUTHOR khyeonm
INPUT h5ad
OUTPUT processed h5ad, UMAP PNG, cluster assignments CSV
# Pipeline settings are read from config.yaml and exposed as `config`.
configfile: "config.yaml"


# Default target: for every entry in config["samples"], request the final
# processed h5ad, the UMAP figure, and the cluster-assignment table.
rule all:
    input:
        expand(
            [
                "/output/{sample}/{sample}_processed.h5ad",
                "/output/{sample}/{sample}_umap.png",
                "/output/{sample}/{sample}_clusters.csv",
            ],
            sample=config["samples"],
        ),
# Stage 1 of 3. Runs scripts/preprocess.py on the per-sample input file that
# config["input_files"] maps to the {sample} wildcard, and writes
# /output/{sample}/{sample}_preprocessed.h5ad.
# Each params default is used only when the key is missing from the config.
# Both stdout and stderr of the script are written to the rule's log file.
rule preprocess:
    """Normalize, log-transform, select HVGs, scale, and run PCA."""
    input:
        h5ad=lambda wildcards: config["input_files"][wildcards.sample],
    output:
        h5ad="/output/{sample}/{sample}_preprocessed.h5ad",
    params:
        n_top_genes=config.get("n_top_genes", 2000),
        n_pcs=config.get("n_pcs", 50),
        target_sum=config.get("target_sum", 1e4),
    threads: config.get("threads", 4)
    log:
        "/output/{sample}/logs/preprocess.log",
    shell:
        """
        mkdir -p /output/{wildcards.sample}/logs
        python scripts/preprocess.py \
            --input {input.h5ad} \
            --output {output.h5ad} \
            --n-top-genes {params.n_top_genes} \
            --n-pcs {params.n_pcs} \
            --target-sum {params.target_sum} \
            > {log} 2>&1
        """
# Stage 2 of 3. Runs scripts/cluster.py on the preprocess rule's h5ad output
# and writes a clustered h5ad plus the per-sample cluster CSV that `rule all`
# requests.
# Each params default is used only when the key is missing from the config.
# Both stdout and stderr of the script are written to the rule's log file.
rule cluster:
    """Run Leiden clustering on the preprocessed data."""
    input:
        h5ad=rules.preprocess.output.h5ad,
    output:
        h5ad="/output/{sample}/{sample}_clustered.h5ad",
        csv="/output/{sample}/{sample}_clusters.csv",
    params:
        resolution=config.get("leiden_resolution", 0.5),
        n_neighbors=config.get("n_neighbors", 15),
    threads: config.get("threads", 4)
    log:
        "/output/{sample}/logs/cluster.log",
    shell:
        """
        python scripts/cluster.py \
            --input {input.h5ad} \
            --output-h5ad {output.h5ad} \
            --output-csv {output.csv} \
            --resolution {params.resolution} \
            --n-neighbors {params.n_neighbors} \
            > {log} 2>&1
        """
# Stage 3 of 3. Runs scripts/visualize.py on the cluster rule's h5ad output
# and writes the final processed h5ad and the UMAP PNG that `rule all`
# requests.
# Each params default is used only when the key is missing from the config.
# Both stdout and stderr of the script are written to the rule's log file.
rule visualize:
    """Compute UMAP and generate visualization."""
    input:
        h5ad=rules.cluster.output.h5ad,
    output:
        h5ad="/output/{sample}/{sample}_processed.h5ad",
        png="/output/{sample}/{sample}_umap.png",
    params:
        min_dist=config.get("umap_min_dist", 0.3),
        spread=config.get("umap_spread", 1.0),
    threads: config.get("threads", 4)
    log:
        "/output/{sample}/logs/visualize.log",
    shell:
        """
        python scripts/visualize.py \
            --input {input.h5ad} \
            --output-h5ad {output.h5ad} \
            --output-png {output.png} \
            --min-dist {params.min_dist} \
            --spread {params.spread} \
            > {log} 2>&1
        """
FROM condaforge/mambaforge:latest
# Install the workflow engine and the Python interpreter with conda, then
# purge conda's caches in the same layer so they are not stored in the image.
RUN conda install -y -c bioconda -c conda-forge \
        snakemake-minimal \
        python=3.10 \
    && conda clean -afy

# Install the analysis libraries with pip. --no-cache-dir keeps pip's
# download cache out of the image layer.
RUN pip install --no-cache-dir \
        "scanpy>=1.9" \
        leidenalg \
        python-igraph \
        "umap-learn>=0.5" \
        matplotlib \
        pandas \
        numpy \
        anndata

# The workflow definition, its configuration, and the helper scripts are
# copied into /pipeline, which is also the working directory at run time.
WORKDIR /pipeline
COPY Snakefile .
COPY config.yaml .
COPY scripts/ scripts/

# Default command: run the workflow with 4 cores.
CMD ["snakemake", "--cores", "4"]
# Sample names (without extension)
samples:
  - "10k"

# Input h5ad files mapped by sample name
input_files:
  "10k": "/input/10k.h5ad"

# Preprocessing parameters
target_sum: 10000  # normalize each cell to this total count
n_top_genes: 2000  # number of highly variable genes
n_pcs: 50  # number of PCA components

# Clustering parameters
n_neighbors: 15  # k for neighborhood graph
leiden_resolution: 0.5  # Leiden resolution (higher = more clusters)

# UMAP parameters
umap_min_dist: 0.3
umap_spread: 1.0

# Resources
threads: 4
{
"@context": "https://w3id.org/ro/crate/1.1/context",
"@graph": [
{
"@id": "ro-crate-metadata.json",
"@type": "CreativeWork",
"about": {
"@id": "./"
},
"conformsTo": {
"@id": "https://w3id.org/ro/crate/1.1"
}
},
{
"@id": "./",
"@type": [
"Dataset",
"SoftwareSourceCode",
"ComputationalWorkflow"
],
"creator": [
{
"@id": "#author"
}
],
"dateCreated": "2026-03-10",
"description": "Single-cell RNA-seq downstream analysis pipeline: preprocessing (normalize, log-transform, HVG selection, scaling, PCA), Leiden clustering, and UMAP visualization. Outputs processed h5ad, UMAP PNG, and cluster assignment CSV.",
"input": [
{
"@id": "#input-h5ad"
}
],
"keywords": [
"single-cell",
"scRNA-seq",
"clustering",
"leiden",
"umap",
"scanpy",
"downstream-analysis"
],
"license": {
"@id": "https://spdx.org/licenses/MIT"
},
"name": "sc-downstream-analysis",
"output": [
{
"@id": "#output-h5ad"
},
{
"@id": "#output-umap-png"
},
{
"@id": "#output-clusters-csv"
}
],
"programmingLanguage": {
"@id": "#snakemake"
},
"softwareRequirements": [
{
"@id": "#scanpy"
},
{
"@id": "#leidenalg"
},
{
"@id": "#umap-learn"
}
],
"version": "1.0.0"
},
{
"@id": "#author",
"@type": "Person",
"name": "khyeonm"
},
{
"@id": "#snakemake",
"@type": "ComputerLanguage",
"name": "Snakemake",
"url": "https://snakemake.readthedocs.io"
},
{
"@id": "#scanpy",
"@type": "SoftwareApplication",
"name": "scanpy"
},
{
"@id": "#leidenalg",
"@type": "SoftwareApplication",
"name": "leidenalg"
},
{
"@id": "#umap-learn",
"@type": "SoftwareApplication",
"name": "umap-learn"
},
{
"@id": "#input-h5ad",
"@type": "FormalParameter",
"encodingFormat": "application/x-hdf5",
"name": "h5ad"
},
{
"@id": "#output-h5ad",
"@type": "FormalParameter",
"encodingFormat": "application/x-hdf5",
"name": "processed h5ad"
},
{
"@id": "#output-umap-png",
"@type": "FormalParameter",
"encodingFormat": "image/png",
"name": "UMAP PNG"
},
{
"@id": "#output-clusters-csv",
"@type": "FormalParameter",
"encodingFormat": "text/csv",
"name": "cluster assignments CSV"
}
],
"version": "1.0.0"
}