germline-variant-calling
Calls germline variants from BAM files using GATK HaplotypeCaller in GVCF mode. Supports multiple samples in parallel, producing per-sample g.VCF files ready for joint genotyping.
VERSION 1.0.0
AUTHOR khyeonm
INPUT bam, reference fasta
OUTPUT g.vcf.gz, g.vcf.gz.tbi
import os

# Pipeline settings are read from config.yaml; "samples" and "reference" are
# looked up with plain indexing below, so both keys must be present.
configfile: "config.yaml"

# Sample names; rule haplotype_caller reads /input/{sample}.bam for each one.
SAMPLES = config["samples"]
# Path of the reference FASTA as given in the config.
REFERENCE = config["reference"]

# The reference is staged under /output/ref (see rule prepare_reference).
_REF_NAME = os.path.basename(REFERENCE)
REF_COPY = "/output/ref/" + _REF_NAME
# Dictionary name = FASTA file name with its final extension swapped for
# ".dict". Chained str.replace() calls are avoided on purpose: replacing ".fa"
# first turns "genome.fasta" into "genome.dictsta" and also rewrites any ".fa"
# that appears earlier in the file name.
REF_DICT = "/output/ref/" + os.path.splitext(_REF_NAME)[0] + ".dict"
rule all:
    """Default target: the GVCF and its tabix index for every configured sample."""
    input:
        expand(
            [
                "/output/{sample}.g.vcf.gz",
                "/output/{sample}.g.vcf.gz.tbi",
            ],
            sample=SAMPLES,
        )
rule prepare_reference:
    """Copy the reference to a writable location and create its sequence dictionary.

    The FASTA and its .fai index are copied to REF_COPY / REF_COPY + ".fai",
    then GATK CreateSequenceDictionary is run on the copy and writes REF_DICT.
    All steps run in one subshell whose stderr goes to the rule's log, so a
    failing mkdir/cp is recorded there as well as a failing gatk call.
    Paths are double-quoted so that file names containing spaces still work.
    """
    input:
        ref=REFERENCE,
        fai=REFERENCE + ".fai"
    output:
        ref=REF_COPY,
        fai=REF_COPY + ".fai",
        dic=REF_DICT
    log:
        "/output/logs/prepare_reference.log"
    shell:
        """
        (
            mkdir -p /output/ref
            cp "{input.ref}" "{output.ref}"
            cp "{input.fai}" "{output.fai}"
            gatk CreateSequenceDictionary -R "{output.ref}" -O "{output.dic}"
        ) 2> "{log}"
        """
rule haplotype_caller:
    """Run GATK HaplotypeCaller in GVCF mode (-ERC GVCF) for one sample.

    Reads /input/{sample}.bam (with its .bai) and writes
    /output/{sample}.g.vcf.gz. The .tbi index is declared as an output but is
    not created by a separate command here; it is expected to be written by
    GATK next to the GVCF.

    The staged reference index (fai) and dictionary (dic) are declared as
    inputs although the command line does not name them: they only need to
    exist beside the reference, and declaring them makes Snakemake track them
    as dependencies of this rule.

    Config keys read (with defaults): threads (4), java_opts ("-Xmx8g"),
    haplotypecaller_extra (""). The extra arguments are inserted unquoted so
    that several flags can be passed in one string.
    """
    input:
        bam="/input/{sample}.bam",
        bai="/input/{sample}.bam.bai",
        ref=REF_COPY,
        fai=REF_COPY + ".fai",
        dic=REF_DICT
    output:
        gvcf="/output/{sample}.g.vcf.gz",
        tbi="/output/{sample}.g.vcf.gz.tbi"
    threads: config.get("threads", 4)
    params:
        java_opts=config.get("java_opts", "-Xmx8g"),
        extra=config.get("haplotypecaller_extra", "")
    log:
        "/output/logs/{sample}_haplotypecaller.log"
    shell:
        """
        gatk --java-options "{params.java_opts}" HaplotypeCaller \
            -R "{input.ref}" \
            -I "{input.bam}" \
            -O "{output.gvcf}" \
            -ERC GVCF \
            --native-pair-hmm-threads {threads} \
            {params.extra} \
            2> "{log}"
        """
FROM condaforge/mambaforge:latest
# Install Snakemake, GATK4 and samtools, then clear the package caches.
# conda-forge is listed before bioconda so that it has the higher priority,
# which is the channel order the Bioconda and Snakemake docs prescribe.
RUN mamba install -y -c conda-forge -c bioconda \
        snakemake-minimal \
        gatk4=4.5.0.0 \
        samtools=1.20 \
    && mamba clean -afy
# The workflow definition and its default config are placed in /pipeline.
WORKDIR /pipeline
COPY Snakefile .
COPY config.yaml .
# The default command only prints Snakemake's usage text; pass an explicit
# snakemake command to `docker run` to execute the workflow.
CMD ["snakemake", "--help"]
# Required: list of sample names (without .bam extension)
samples:
  - 'NA12891_S1'
# Reference genome FASTA path (required); it lives under /input, which is
# mounted at run time
reference: '/input/hg19_ref/hg19_human.fa'
# JVM options handed to GATK (heap size)
java_opts: '-Xmx8g'
# Thread count for HaplotypeCaller's native pair-HMM
threads: 4
# Additional HaplotypeCaller arguments, optional
# (e.g. '--dbsnp /input/dbsnp.vcf.gz')
haplotypecaller_extra: ''
{
"@context": "https://w3id.org/ro/crate/1.1/context",
"@graph": [
{
"@id": "ro-crate-metadata.json",
"@type": "CreativeWork",
"about": {
"@id": "./"
},
"conformsTo": {
"@id": "https://w3id.org/ro/crate/1.1"
}
},
{
"@id": "./",
"@type": [
"Dataset",
"SoftwareSourceCode",
"ComputationalWorkflow"
],
"creator": [
{
"@id": "#author"
}
],
"dateCreated": "2026-03-10",
"description": "Calls germline variants from BAM files using GATK HaplotypeCaller in GVCF mode. Supports multiple samples in parallel, producing per-sample g.VCF files ready for joint genotyping.",
"input": [
{
"@id": "#input-bam"
},
{
"@id": "#input-reference"
}
],
"isBasedOn": {
"@id": ""
},
"keywords": [
"germline",
"variant-calling",
"GATK",
"HaplotypeCaller",
"GVCF",
"WGS",
"WES",
"human",
"SNP",
"indel"
],
"license": {
"@id": "https://spdx.org/licenses/MIT"
},
"name": "germline-variant-calling",
"output": [
{
"@id": "#output-gvcf"
},
{
"@id": "#output-tbi"
}
],
"programmingLanguage": {
"@id": "#snakemake"
},
"sdPublisher": {
"@id": "https://hub.autopipe.org"
},
"softwareRequirements": [
{
"@id": "#gatk4"
},
{
"@id": "#samtools"
}
],
"version": "1.0.0"
},
{
"@id": "#author",
"@type": "Person",
"name": "khyeonmin"
},
{
"@id": "#snakemake",
"@type": "ComputerLanguage",
"name": "Snakemake",
"url": "https://snakemake.readthedocs.io"
},
{
"@id": "#gatk4",
"@type": "SoftwareApplication",
"name": "GATK4 HaplotypeCaller"
},
{
"@id": "#samtools",
"@type": "SoftwareApplication",
"name": "samtools"
},
{
"@id": "#input-bam",
"@type": "FormalParameter",
"encodingFormat": "application/x-bam",
"name": "bam"
},
{
"@id": "#input-reference",
"@type": "FormalParameter",
"encodingFormat": "text/x-fasta",
"name": "reference fasta"
},
{
"@id": "#output-gvcf",
"@type": "FormalParameter",
"encodingFormat": "text/x-vcf",
"name": "g.vcf.gz"
},
{
"@id": "#output-tbi",
"@type": "FormalParameter",
"encodingFormat": "application/octet-stream",
"name": "g.vcf.gz.tbi"
}
],
"version": "1.0.0"
}
# germline-variant-calling
Calls germline variants from BAM files using GATK HaplotypeCaller in GVCF mode. Supports multiple samples in parallel, producing per-sample `.g.vcf.gz` files ready for joint genotyping.
## Inputs
| File | Description |
|------|-------------|
| `{sample}.bam` + `.bai` | Sorted, indexed BAM files |
| `reference.fa` + `.fai` | Reference genome FASTA with its `.fai` index; the `.dict` sequence dictionary is created by the pipeline under `/output/ref` |
## Outputs
| File | Description |
|------|-------------|
| `{sample}.g.vcf.gz` | Per-sample GVCF with raw variant calls |
| `{sample}.g.vcf.gz.tbi` | Tabix index for the GVCF |
## Configuration (`config.yaml`)
```yaml
samples:
- NA12891_S1 # list sample names (without .bam extension)
reference: "/input/hg19_ref/hg19_human.fa" # path to reference FASTA inside container
java_opts: "-Xmx8g" # Java heap size
threads: 4 # threads for native pair-HMM
haplotypecaller_extra: "" # extra HaplotypeCaller flags
```
## Running
```bash
# Build
docker build -t germline-variant-calling:1.0.0 .
# Run
docker run --rm \
-v /path/to/data:/input:ro \
-v /path/to/output:/output \
germline-variant-calling:1.0.0 \
snakemake --cores 4
```