Autopipe Hub
Loading...

germline-variant-calling

Calls germline variants from BAM files using GATK HaplotypeCaller in GVCF mode. Supports multiple samples in parallel, producing per-sample g.VCF files ready for joint genotyping.

VERSION 1.0.0
AUTHOR khyeonm
INPUT bam, reference fasta
OUTPUT g.vcf.gz, g.vcf.gz.tbi
TOOLS
GATK4 HaplotypeCaller, samtools
TAGS
germline, variant-calling, GATK, HaplotypeCaller, GVCF, WGS, WES, human, SNP, indel
import os

configfile: "config.yaml"

# Sample names and the reference FASTA path, both read from the config file.
SAMPLES = config["samples"]
REFERENCE = config["reference"]

# The reference is staged under /output/ref, keeping its original file name.
REF_COPY = "/output/ref/" + os.path.basename(REFERENCE)

# The sequence dictionary sits next to the staged copy, with only the final
# extension swapped for ".dict". A substring replace of ".fa" is not used
# because it also matches the start of ".fasta" ("genome.fasta" would become
# "genome.dictsta"), and it leaves other extensions untouched so that the
# dictionary path would equal the FASTA path.
REF_DICT = os.path.splitext(REF_COPY)[0] + ".dict"

rule all:
    """Default target: the GVCF and its .tbi index for every configured sample."""
    input:
        expand(
            [
                "/output/{sample}.g.vcf.gz",
                "/output/{sample}.g.vcf.gz.tbi",
            ],
            sample=SAMPLES,
        )

rule prepare_reference:
    """Stage the reference under /output/ref and create its sequence dictionary.

    The FASTA and its .fai index are copied from their configured location,
    then GATK CreateSequenceDictionary writes the dictionary for the copy.
    """
    input:
        ref=REFERENCE,
        fai=REFERENCE + ".fai"
    output:
        ref=REF_COPY,
        fai=REF_COPY + ".fai",
        dic=REF_DICT
    log:
        "/output/logs/prepare_reference.log"
    shell:
        """
        # Route stdout and stderr of every command below into the rule log, so
        # a failing mkdir/cp is recorded as well as the GATK output.
        exec > {log} 2>&1
        mkdir -p /output/ref
        cp {input.ref} {output.ref}
        cp {input.fai} {output.fai}
        gatk CreateSequenceDictionary -R {output.ref} -O {output.dic}
        """

rule haplotype_caller:
    """Run GATK HaplotypeCaller in GVCF mode (-ERC GVCF) on one sample's BAM.

    Inputs are the sample BAM and its index from /input plus the staged
    reference, its .fai index and its sequence dictionary. The thread count,
    Java options and any extra HaplotypeCaller arguments come from the config,
    with defaults of 4 threads, "-Xmx8g" and no extra arguments.
    """
    input:
        bam="/input/{sample}.bam",
        bai="/input/{sample}.bam.bai",
        ref=REF_COPY,
        # Declared so the job is tied to the staged FASTA index as well; it is
        # not passed on the command line.
        fai=REF_COPY + ".fai",
        dic=REF_DICT
    output:
        gvcf="/output/{sample}.g.vcf.gz",
        # Not requested explicitly in the command below; this relies on GATK
        # writing an index next to the output it creates.
        tbi="/output/{sample}.g.vcf.gz.tbi"
    threads: config.get("threads", 4)
    params:
        java_opts=config.get("java_opts", "-Xmx8g"),
        extra=config.get("haplotypecaller_extra", "")
    log:
        "/output/logs/{sample}_haplotypecaller.log"
    shell:
        """
        gatk --java-options "{params.java_opts}" HaplotypeCaller \
            -R {input.ref} \
            -I {input.bam} \
            -O {output.gvcf} \
            -ERC GVCF \
            --native-pair-hmm-threads {threads} \
            {params.extra} \
            > {log} 2>&1
        """
# Image for the germline-variant-calling Snakemake pipeline.
# NOTE(review): ":latest" is a floating tag, so rebuilds are not reproducible;
# consider pinning a specific tag. Also confirm whether Mambaforge is still
# maintained upstream before relying on this base image.
FROM condaforge/mambaforge:latest

# Install the workflow engine and the pinned tools. conda-forge is listed
# before bioconda so that it takes priority, matching the channel order that
# Bioconda documents.
RUN mamba install -y -c conda-forge -c bioconda \
    snakemake-minimal \
    gatk4=4.5.0.0 \
    samtools=1.20 \
    && mamba clean -afy

# The workflow definition and its default configuration live in /pipeline.
WORKDIR /pipeline
COPY Snakefile .
COPY config.yaml .

# The default command only prints Snakemake's usage text; pass an explicit
# snakemake command to `docker run` to execute the workflow.
CMD ["snakemake", "--help"]
# Sample names to process, given without the .bam extension (required)
samples:
  - NA12891_S1

# Reference genome FASTA, mounted at runtime under /input (required)
reference: '/input/hg19_ref/hg19_human.fa'

# JVM options passed to GATK (heap size)
java_opts: '-Xmx8g'

# Thread count for HaplotypeCaller's native pair-HMM
threads: 4

# Additional HaplotypeCaller arguments, e.g. '--dbsnp /input/dbsnp.vcf.gz' (optional)
haplotypecaller_extra: ''
{
  "@context": "https://w3id.org/ro/crate/1.1/context",
  "@graph": [
    {
      "@id": "ro-crate-metadata.json",
      "@type": "CreativeWork",
      "about": {
        "@id": "./"
      },
      "conformsTo": {
        "@id": "https://w3id.org/ro/crate/1.1"
      }
    },
    {
      "@id": "./",
      "@type": [
        "Dataset",
        "SoftwareSourceCode",
        "ComputationalWorkflow"
      ],
      "creator": [
        {
          "@id": "#author"
        }
      ],
      "dateCreated": "2026-03-10",
      "description": "Calls germline variants from BAM files using GATK HaplotypeCaller in GVCF mode. Supports multiple samples in parallel, producing per-sample g.VCF files ready for joint genotyping.",
      "input": [
        {
          "@id": "#input-bam"
        },
        {
          "@id": "#input-reference"
        }
      ],
      "isBasedOn": {
        "@id": ""
      },
      "keywords": [
        "germline",
        "variant-calling",
        "GATK",
        "HaplotypeCaller",
        "GVCF",
        "WGS",
        "WES",
        "human",
        "SNP",
        "indel"
      ],
      "license": {
        "@id": "https://spdx.org/licenses/MIT"
      },
      "name": "germline-variant-calling",
      "output": [
        {
          "@id": "#output-gvcf"
        },
        {
          "@id": "#output-tbi"
        }
      ],
      "programmingLanguage": {
        "@id": "#snakemake"
      },
      "sdPublisher": {
        "@id": "https://hub.autopipe.org"
      },
      "softwareRequirements": [
        {
          "@id": "#gatk4"
        },
        {
          "@id": "#samtools"
        }
      ],
      "version": "1.0.0"
    },
    {
      "@id": "#author",
      "@type": "Person",
      "name": "khyeonmin"
    },
    {
      "@id": "#snakemake",
      "@type": "ComputerLanguage",
      "name": "Snakemake",
      "url": "https://snakemake.readthedocs.io"
    },
    {
      "@id": "#gatk4",
      "@type": "SoftwareApplication",
      "name": "GATK4 HaplotypeCaller"
    },
    {
      "@id": "#samtools",
      "@type": "SoftwareApplication",
      "name": "samtools"
    },
    {
      "@id": "#input-bam",
      "@type": "FormalParameter",
      "encodingFormat": "application/x-bam",
      "name": "bam"
    },
    {
      "@id": "#input-reference",
      "@type": "FormalParameter",
      "encodingFormat": "text/x-fasta",
      "name": "reference fasta"
    },
    {
      "@id": "#output-gvcf",
      "@type": "FormalParameter",
      "encodingFormat": "text/x-vcf",
      "name": "g.vcf.gz"
    },
    {
      "@id": "#output-tbi",
      "@type": "FormalParameter",
      "encodingFormat": "application/octet-stream",
      "name": "g.vcf.gz.tbi"
    }
  ],
  "version": "1.0.0"
}
# germline-variant-calling

Calls germline variants from BAM files using GATK HaplotypeCaller in GVCF mode. Supports multiple samples in parallel, producing per-sample `.g.vcf.gz` files ready for joint genotyping.

## Inputs

| File | Description |
|------|-------------|
| `{sample}.bam` + `.bai` | Sorted, indexed BAM files |
| `reference.fa` + `.fai` | Reference genome FASTA with its index; the sequence dictionary (`.dict`) is generated by the pipeline |

## Outputs

| File | Description |
|------|-------------|
| `{sample}.g.vcf.gz` | Per-sample GVCF with raw variant calls |
| `{sample}.g.vcf.gz.tbi` | Tabix index for the GVCF |

## Configuration (`config.yaml`)

```yaml
samples:
  - NA12891_S1      # list sample names (without .bam extension)

reference: "/input/hg19_ref/hg19_human.fa"   # path to reference FASTA inside container

java_opts: "-Xmx8g"          # Java heap size
threads: 4                   # threads for native pair-HMM
haplotypecaller_extra: ""    # extra HaplotypeCaller flags
```

## Running

```bash
# Build
docker build -t germline-variant-calling:1.0.0 .

# Run
docker run --rm \
  -v /path/to/data:/input:ro \
  -v /path/to/output:/output \
  germline-variant-calling:1.0.0 \
  snakemake --cores 4
```
v1.0.0 latest
2026-03-10 · khyeonm