Autopipe Hub
Loading...

sc-downstream-analysis

Single-cell RNA-seq downstream analysis pipeline: preprocessing (normalize, log-transform, HVG selection, scaling, PCA), Leiden clustering, and UMAP visualization. Outputs processed h5ad, UMAP PNG, and cluster assignment CSV.

VERSION 1.0.0
AUTHOR khyeonm
INPUT h5ad
OUTPUT processed h5ad, UMAP PNG, cluster assignments CSV
TOOLS
scanpy, leidenalg, umap-learn
TAGS
single-cell, scRNA-seq, clustering, leiden, umap, downstream-analysis
configfile: "config.yaml"

rule all:
    """Default target: request every final artifact for every configured sample.

    For each entry of config["samples"] this asks for the processed h5ad,
    the UMAP PNG, and the cluster-assignment CSV under /output/<sample>/.
    """
    input:
        # One expand over the product of sample x artifact suffix; it yields
        # the same set of target paths as listing each artifact separately.
        expand(
            "/output/{sample}/{sample}_{artifact}",
            sample=config["samples"],
            artifact=["processed.h5ad", "umap.png", "clusters.csv"],
        )

rule preprocess:
    """Normalize, log-transform, select HVGs, scale, and run PCA.

    Runs scripts/preprocess.py on the sample's input h5ad and writes the
    preprocessed h5ad. The steps named above come from this rule's original
    description; the script itself is not part of this file.

    Input:  the h5ad path registered for the sample in config["input_files"].
    Output: /output/<sample>/<sample>_preprocessed.h5ad
    Log:    stderr of the script, in /output/<sample>/logs/preprocess.log
    """
    input:
        # Looked up per sample; a sample missing from config["input_files"]
        # raises KeyError when the workflow is resolved.
        h5ad=lambda wildcards: config["input_files"][wildcards.sample]
    output:
        h5ad="/output/{sample}/{sample}_preprocessed.h5ad"
    params:
        # Each value falls back to the default shown when absent from config.
        n_top_genes=config.get("n_top_genes", 2000),
        n_pcs=config.get("n_pcs", 50),
        target_sum=config.get("target_sum", 1e4)
    # Declared for scheduling only; the command below does not forward it.
    threads: config.get("threads", 4)
    log:
        "/output/{sample}/logs/preprocess.log"
    # Paths are interpolated with the ":q" modifier so that sample names or
    # input paths containing spaces or shell metacharacters stay one argument.
    shell:
        """
        mkdir -p /output/{wildcards.sample:q}/logs
        python scripts/preprocess.py \
            --input {input.h5ad:q} \
            --output {output.h5ad:q} \
            --n-top-genes {params.n_top_genes} \
            --n-pcs {params.n_pcs} \
            --target-sum {params.target_sum} \
            2> {log:q}
        """

rule cluster:
    """Run Leiden clustering on the preprocessed data.

    Runs scripts/cluster.py on the output of rule preprocess and produces
    both a clustered h5ad and a CSV of cluster assignments.

    Input:  the h5ad written by rule preprocess.
    Output: /output/<sample>/<sample>_clustered.h5ad
            /output/<sample>/<sample>_clusters.csv
    Log:    stderr of the script, in /output/<sample>/logs/cluster.log
    """
    input:
        h5ad=rules.preprocess.output.h5ad
    output:
        h5ad="/output/{sample}/{sample}_clustered.h5ad",
        csv="/output/{sample}/{sample}_clusters.csv"
    params:
        # Each value falls back to the default shown when absent from config.
        resolution=config.get("leiden_resolution", 0.5),
        n_neighbors=config.get("n_neighbors", 15)
    # Declared for scheduling only; the command below does not forward it.
    threads: config.get("threads", 4)
    log:
        "/output/{sample}/logs/cluster.log"
    # Paths are interpolated with the ":q" modifier so that sample names
    # containing spaces or shell metacharacters stay one argument.
    shell:
        """
        python scripts/cluster.py \
            --input {input.h5ad:q} \
            --output-h5ad {output.h5ad:q} \
            --output-csv {output.csv:q} \
            --resolution {params.resolution} \
            --n-neighbors {params.n_neighbors} \
            2> {log:q}
        """

rule visualize:
    """Compute UMAP and generate visualization.

    Runs scripts/visualize.py on the clustered h5ad and produces the final
    processed h5ad together with a UMAP PNG; both are targets of rule all.

    Input:  the clustered h5ad written by rule cluster.
    Output: /output/<sample>/<sample>_processed.h5ad
            /output/<sample>/<sample>_umap.png
    Log:    stderr of the script, in /output/<sample>/logs/visualize.log
    """
    input:
        h5ad=rules.cluster.output.h5ad
    output:
        h5ad="/output/{sample}/{sample}_processed.h5ad",
        png="/output/{sample}/{sample}_umap.png"
    params:
        # Each value falls back to the default shown when absent from config.
        min_dist=config.get("umap_min_dist", 0.3),
        spread=config.get("umap_spread", 1.0)
    # Declared for scheduling only; the command below does not forward it.
    threads: config.get("threads", 4)
    log:
        "/output/{sample}/logs/visualize.log"
    # Paths are interpolated with the ":q" modifier so that sample names
    # containing spaces or shell metacharacters stay one argument.
    shell:
        """
        python scripts/visualize.py \
            --input {input.h5ad:q} \
            --output-h5ad {output.h5ad:q} \
            --output-png {output.png:q} \
            --min-dist {params.min_dist} \
            --spread {params.spread} \
            2> {log:q}
        """
# Container image for the sc-downstream-analysis Snakemake workflow.
# NOTE(review): ":latest" is a floating tag, so rebuilding at a later date can
# pick up a different base image -- consider pinning a specific tag.
FROM condaforge/mambaforge:latest

# Install the workflow engine and Python 3.10, then clear conda's package
# caches in the same layer so they are not stored in the image.
RUN conda install -y -c bioconda -c conda-forge \
    snakemake-minimal \
    python=3.10 \
    && conda clean -afy

# Python libraries for the analysis. --no-cache-dir keeps pip's download
# cache out of the image layer, matching the conda cleanup above.
RUN pip install --no-cache-dir \
    "scanpy>=1.9" \
    leidenalg \
    python-igraph \
    "umap-learn>=0.5" \
    matplotlib \
    pandas \
    numpy \
    anndata

# Workflow definition, its configuration, and the helper scripts it invokes.
WORKDIR /pipeline
COPY Snakefile .
COPY config.yaml .
COPY scripts/ scripts/

# Default command: run the workflow from /pipeline with up to 4 cores.
# The Snakefile reads from /input and writes to /output, so those paths are
# presumably meant to be bind-mounted at run time -- confirm with the deployer.
CMD ["snakemake", "--cores", "4"]
# Sample identifiers (no file extension). Every identifier listed here must
# also be a key of input_files below, because the Snakefile looks the input
# path up by sample name.
samples:
  - '10k'

# Input h5ad path for each sample identifier
input_files:
  '10k': '/input/10k.h5ad'

# Preprocessing (passed to scripts/preprocess.py)
target_sum: 10000  # per-cell total count to normalize to
n_top_genes: 2000  # how many highly variable genes to keep
n_pcs: 50  # how many PCA components to compute

# Clustering (passed to scripts/cluster.py)
n_neighbors: 15  # k of the neighborhood graph
leiden_resolution: 0.5  # Leiden resolution; larger values give more clusters

# UMAP (passed to scripts/visualize.py as --min-dist / --spread)
umap_min_dist: 0.3
umap_spread: 1.0

# Resources: thread count declared by every rule
threads: 4
{
  "@context": "https://w3id.org/ro/crate/1.1/context",
  "@graph": [
    {
      "@id": "ro-crate-metadata.json",
      "@type": "CreativeWork",
      "about": {
        "@id": "./"
      },
      "conformsTo": {
        "@id": "https://w3id.org/ro/crate/1.1"
      }
    },
    {
      "@id": "./",
      "@type": [
        "Dataset",
        "SoftwareSourceCode",
        "ComputationalWorkflow"
      ],
      "creator": [
        {
          "@id": "#author"
        }
      ],
      "dateCreated": "2026-03-10",
      "description": "Single-cell RNA-seq downstream analysis pipeline: preprocessing (normalize, log-transform, HVG selection, scaling, PCA), Leiden clustering, and UMAP visualization. Outputs processed h5ad, UMAP PNG, and cluster assignment CSV.",
      "input": [
        {
          "@id": "#input-h5ad"
        }
      ],
      "keywords": [
        "single-cell",
        "scRNA-seq",
        "clustering",
        "leiden",
        "umap",
        "scanpy",
        "downstream-analysis"
      ],
      "license": {
        "@id": "https://spdx.org/licenses/MIT"
      },
      "name": "sc-downstream-analysis",
      "output": [
        {
          "@id": "#output-h5ad"
        },
        {
          "@id": "#output-umap-png"
        },
        {
          "@id": "#output-clusters-csv"
        }
      ],
      "programmingLanguage": {
        "@id": "#snakemake"
      },
      "softwareRequirements": [
        {
          "@id": "#scanpy"
        },
        {
          "@id": "#leidenalg"
        },
        {
          "@id": "#umap-learn"
        }
      ],
      "version": "1.0.0"
    },
    {
      "@id": "#author",
      "@type": "Person",
      "name": "khyeonm"
    },
    {
      "@id": "#snakemake",
      "@type": "ComputerLanguage",
      "name": "Snakemake",
      "url": "https://snakemake.readthedocs.io"
    },
    {
      "@id": "#scanpy",
      "@type": "SoftwareApplication",
      "name": "scanpy"
    },
    {
      "@id": "#leidenalg",
      "@type": "SoftwareApplication",
      "name": "leidenalg"
    },
    {
      "@id": "#umap-learn",
      "@type": "SoftwareApplication",
      "name": "umap-learn"
    },
    {
      "@id": "#input-h5ad",
      "@type": "FormalParameter",
      "encodingFormat": "application/x-hdf5",
      "name": "h5ad"
    },
    {
      "@id": "#output-h5ad",
      "@type": "FormalParameter",
      "encodingFormat": "application/x-hdf5",
      "name": "processed h5ad"
    },
    {
      "@id": "#output-umap-png",
      "@type": "FormalParameter",
      "encodingFormat": "image/png",
      "name": "UMAP PNG"
    },
    {
      "@id": "#output-clusters-csv",
      "@type": "FormalParameter",
      "encodingFormat": "text/csv",
      "name": "cluster assignments CSV"
    }
  ],
  "version": "1.0.0"
}
v1.0.0 latest
2026-03-10 · khyeonm