diff --git a/.gitattributes b/.gitattributes index e87cc09ed..133928a1f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -17,3 +17,8 @@ bin/check-job-alive !text !filter !merge !diff *.conf !text !filter !merge !diff .fa filter=lfs diff=lfs merge=lfs -text *.fa filter=lfs diff=lfs merge=lfs -text +*.mmi filter=lfs diff=lfs merge=lfs -text +*.gbz filter=lfs diff=lfs merge=lfs -text +*.dist filter=lfs diff=lfs merge=lfs -text +*.min filter=lfs diff=lfs merge=lfs -text +*.zipcodes filter=lfs diff=lfs merge=lfs -text diff --git a/.github/workflows/sprocket-test.yaml b/.github/workflows/sprocket-test.yaml index 5800f6887..0c022f44b 100644 --- a/.github/workflows/sprocket-test.yaml +++ b/.github/workflows/sprocket-test.yaml @@ -26,11 +26,11 @@ jobs: - uses: actions/checkout@v4 with: lfs: true - - name: Update Rust - run: rustup update stable && rustup default stable - - name: Build Sprocket + - name: Install Sprocket run: | - cargo install sprocket --locked + SPROCKET_URL=$(curl -sSL https://api.github.com/repos/stjude-rust-labs/sprocket/releases/latest \ + | jq -r '.assets[] | select(.name | test("x86_64-unknown-linux-gnu.tar.gz")) | .browser_download_url') + curl -sSL "$SPROCKET_URL" | tar -xz -C /usr/local/bin sprocket - name: Update containers run: | ./developer_scripts/update_container_tags.sh ${GITHUB_REF##*/} diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 000000000..36a47bf5b --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,141 @@ +# AGENTS.md + +Agent guidance for the St. Jude Cloud bioinformatics workflows repository. + +## What this repo is + +A collection of WDL 1.1 pipelines and tool wrappers for genomic analysis. Primary artifacts are `.wdl` files; there is no traditional build step. Python and R exist only as scripts embedded in Docker images under `scripts/`. + +## Layout + +``` +workflows/ # End-to-end pipelines (rnaseq/, dnaseq/, chipseq/, methylation/, etc.) +tools/ # Individual WDL task wrappers (one tool per file) +data_structures/ # WDL struct definitions +docker/ # Dockerfiles + package.json version metadata per image +scripts/ # Python and R scripts (not standalone; embedded in Docker images) +tests/ # pytest-workflow YAML test definitions + tools/ # Per-tool test YAMLs + workflows/ # Per-workflow test YAMLs + input_json/ # Input JSON fixtures + input/ # Binary fixtures (BAMs, FASTQs, etc.) — tracked via git-lfs +template/ # task-examples.wdl (canonical patterns) + common-parameter-meta.txt +developer_scripts/ + run_sprocket_or_miniwdl.sh # Unified test runner + update_container_tags.sh # Rewrites container tags for branch testing (do not commit) +``` + +## Developer commands + +### Setup + +```bash +uv sync # Python deps (canonical; use this, not pip) +git lfs pull # Required after clone to get test fixture files +``` + +### WDL lint / format / check (Sprocket) + +```bash +sprocket lint . # Lint all WDL +sprocket format tools/bwa.wdl # Format a file +sprocket check . # Thorough check (validates inputs, etc.) +sprocket validate # Validate inputs against a workflow +``` + +`sprocket.toml` disables the `ContainerUri` rule and sets `deny_notes = true` (notes in WDL cause check failure). + +### Python (scripts/ only) + +```bash +uv run ruff format --check --diff scripts/ +uv run ruff format scripts/ +uv run ruff check scripts/ +``` + +### R (scripts/ only) + +```r +styler::style_dir() # format +lintr::lint_dir() # lint +``` + +### Tests + +```bash +# All tests +uv run pytest --kwdof --wt $(nproc) + +# Single test by tag +pytest --tag bwa + +# Single test by name +pytest -k bwa_aln + +# Choose runner (defaults to sprocket) +RUNNER=miniwdl pytest --tag bwa +``` + +`pytest.ini` always injects `--git-aware --symlink`. Use `--kwdof` to keep outputs on failure. + +Every test calls `./developer_scripts/run_sprocket_or_miniwdl.sh` internally — do not call runners directly in test commands. + +### Run a WDL task directly (outside pytest) + +```bash +# sprocket +sprocket run --output-dir output --target bwa_aln tools/bwa.wdl + +# miniwdl +miniwdl run --task bwa_aln --verbose --dir output/. -i tests/tools/input_json/bwa_aln.json tools/bwa.wdl +``` + +## CI pipeline + +All jobs trigger on push. Key facts: +- `reference` and `slow` tags are **excluded** from the CI test matrix. +- CI deletes `slow`-tagged tests from YAML files in-place before running — do not replicate this destructively locally. +- CI builds Docker images and runs `update_container_tags.sh` to rewrite tags before pytest — do not commit the rewritten WDL files. +- Every test runs against both `sprocket` and `miniwdl` runners as a matrix. +- `sprocket` is built from source (`cargo install sprocket --locked`) in CI. +- `requirements-ci.txt` pins older versions than `pyproject.toml`; use `uv` locally. + +## WDL conventions (non-obvious) + +- All WDL files must be `version 1.1`. +- All tasks must include `set -euo pipefail` when using pipes or multiple commands. +- Multi-core tasks: accept `use_all_cores: Boolean = false` and `ncpu: Int = 2`; use `$(nproc)` when `use_all_cores` is true. `use_all_cores` must be the last Boolean in the input block; `ncpu` precedes memory/disk inputs. +- Resource inputs (`ncpu`, `modify_memory_gb`, `modify_disk_size_gb`) must be overridable. +- Single output → `outfile_name`; multiple outputs or extension matters → `prefix`. +- BAM/BAI companion pairs must be localized via `ln -s` to CWD and cleaned up with `rm` at task end. +- Scripts go in `scripts/`; never embed Python/R directly in WDL `command` blocks. +- Imports within the repo must use **relative paths**. External imports must pin a **tagged release** (not `main`/`master`) — enforced by `pull-check.yaml`. +- Deprecated tasks: add `deprecated: true` and `warning: "**[DEPRECATED]**..."` to `meta`. Never add `deprecated: false`. +- Update the relevant `CHANGELOG.md` when modifying any WDL under a subdirectory. + +## Parameter meta conventions + +Prefer names from `template/common-parameter-meta.txt`. Key ones: +- `bam` (not `input_bam`, `in_bam`) +- `bam_index` (not `bai`) +- `read_one_fastq_gz` / `read_two_fastq_gz` (not `read1`/`read2`) +- `paired_end` (not `paired`) + +## Docker image versioning + +Each image lives in `docker//` with `Dockerfile` and `package.json`. `version` = underlying tool version; `revision` starts at `0`, increments for image-only changes, resets to `0` on tool version upgrades. Prefer BioContainers images over creating new custom images. + +## Testing quirks + +- Binary fixtures in `tests/input/` are in **git-lfs** — run `git lfs pull` before testing. +- Test files prefixed with `_` (e.g., `_test_methylation-preprocess.yaml`) are disabled. +- Sprocket outputs land in `output/runs/*/_latest/outputs.json`, then get copied to `output/outputs.json`. miniwdl outputs go directly to `output/`. Test YAMLs reference `output/outputs.json`. +- Fixture inputs are downsampled to small chromosomes (chrY/chrM, chr9/chr22) for speed. + +## Existing instruction sources + +- `.github/instructions/wdl.instructions.md` — applies to `**/*.wdl`; references CONTRIBUTING.md, best-practices.md, template/task-examples.wdl, and template/common-parameter-meta.txt. +- `template/task-examples.wdl` — canonical WDL task patterns; read before writing new tasks. +- `template/common-parameter-meta.txt` — required/banned parameter_meta strings. +- `CONTRIBUTING.md` — general coding style. +- `best-practices.md` — WDL-specific best practices. diff --git a/data_structures/CHANGELOG.md b/data_structures/CHANGELOG.md index 21e57972b..37d41d27a 100644 --- a/data_structures/CHANGELOG.md +++ b/data_structures/CHANGELOG.md @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/). +## 2026 May + +### Added + +- New `read_group.read_group_to_array` task to convert a `ReadGroup` struct to `Array[String]` [#282](https://github.com/stjudecloud/workflows/pull/282) + ## 2026 February ### Changed diff --git a/data_structures/read_group.wdl b/data_structures/read_group.wdl index 94632b4c6..6f9bd90c4 100644 --- a/data_structures/read_group.wdl +++ b/data_structures/read_group.wdl @@ -425,3 +425,50 @@ task inner_read_group_to_string { maxRetries: 1 } } + +task read_group_to_array { + meta { + description: "Converts a `ReadGroup` struct to a `Array[String]` **without any validation**." + outputs: { + converted_read_group: "Input `ReadGroup` as a `Array[String]`", + } + } + + parameter_meta { + read_group: "`ReadGroup` struct to convert to array" + } + + input { + ReadGroup read_group + } + + String delimiter = "\n" + + command <<< + { + echo -n "~{"ID:" + read_group.ID}" # required field. All others optional + echo -n "~{delimiter + "BC:" + read_group.BC}" + echo -n "~{delimiter + "CN:" + read_group.CN}" + echo -n "~{delimiter + "DS:" + read_group.DS}" + echo -n "~{delimiter + "DT:" + read_group.DT}" + echo -n "~{delimiter + "FO:" + read_group.FO}" + echo -n "~{delimiter + "KS:" + read_group.KS}" + echo -n "~{delimiter + "LB:" + read_group.LB}" + echo -n "~{delimiter + "PG:" + read_group.PG}" + echo -n "~{delimiter + "PI:" + read_group.PI}" + echo -n "~{delimiter + "PL:" + read_group.PL}" + echo -n "~{delimiter + "PM:" + read_group.PM}" + echo -n "~{delimiter + "PU:" + read_group.PU}" + echo "~{delimiter + "SM:" + read_group.SM}" + } >> out.txt + >>> + + output { + Array[String] converted_read_group = read_lines("out.txt") + } + + runtime { + container: "ghcr.io/stjudecloud/util:3.0.3" + maxRetries: 1 + } +} diff --git a/docker/bwamem2/Dockerfile b/docker/bwamem2/Dockerfile new file mode 100644 index 000000000..a79bab258 --- /dev/null +++ b/docker/bwamem2/Dockerfile @@ -0,0 +1,8 @@ +FROM quay.io/biocontainers/samtools:1.17--h00cdaf9_0 AS samtools +FROM quay.io/biocontainers/bwa-mem2:2.3--he70b90d_0 + +COPY --from=samtools /usr/local/bin/ /usr/local/bin/ +COPY --from=samtools /usr/local/lib/ /usr/local/lib/ +COPY --from=samtools /usr/local/libexec/ /usr/local/libexec/ + +ENTRYPOINT [ "bwa-mem2" ] \ No newline at end of file diff --git a/docker/bwamem2/package.json b/docker/bwamem2/package.json new file mode 100644 index 000000000..8a483d363 --- /dev/null +++ b/docker/bwamem2/package.json @@ -0,0 +1,5 @@ +{ + "name": "bwamem2", + "version": "2.3", + "revision": "0" +} \ No newline at end of file diff --git a/docker/hisat2/Dockerfile b/docker/hisat2/Dockerfile new file mode 100644 index 000000000..85b3898dd --- /dev/null +++ b/docker/hisat2/Dockerfile @@ -0,0 +1,8 @@ +FROM quay.io/biocontainers/samtools:1.17--h00cdaf9_0 AS samtools +FROM quay.io/biocontainers/hisat2:2.2.1--hdbdd923_7 + +COPY --from=samtools /usr/local/bin/ /usr/local/bin/ +COPY --from=samtools /usr/local/lib/ /usr/local/lib/ +COPY --from=samtools /usr/local/libexec/ /usr/local/libexec/ + +ENTRYPOINT [ "hisat2" ] \ No newline at end of file diff --git a/docker/hisat2/package.json b/docker/hisat2/package.json new file mode 100644 index 000000000..e17679260 --- /dev/null +++ b/docker/hisat2/package.json @@ -0,0 +1,5 @@ +{ + "name": "hisat2", + "version": "2.2.1", + "revision": "0" +} \ No newline at end of file diff --git a/docker/minimap2/Dockerfile b/docker/minimap2/Dockerfile new file mode 100644 index 000000000..7fb04869b --- /dev/null +++ b/docker/minimap2/Dockerfile @@ -0,0 +1,8 @@ +FROM quay.io/biocontainers/samtools:1.17--h00cdaf9_0 AS samtools +FROM quay.io/biocontainers/minimap2:2.30--h577a1d6_0 + +COPY --from=samtools /usr/local/bin/ /usr/local/bin/ +COPY --from=samtools /usr/local/lib/ /usr/local/lib/ +COPY --from=samtools /usr/local/libexec/ /usr/local/libexec/ + +ENTRYPOINT [ "minimap2" ] \ No newline at end of file diff --git a/docker/minimap2/package.json b/docker/minimap2/package.json new file mode 100644 index 000000000..a78b7377c --- /dev/null +++ b/docker/minimap2/package.json @@ -0,0 +1,5 @@ +{ + "name": "minimap2", + "version": "2.30", + "revision": "0" +} \ No newline at end of file diff --git a/docker/ngsep/Dockerfile b/docker/ngsep/Dockerfile new file mode 100644 index 000000000..86226e361 --- /dev/null +++ b/docker/ngsep/Dockerfile @@ -0,0 +1,5 @@ +FROM eclipse-temurin:21 + +RUN wget https://github.com/NGSEP/NGSEPcore/releases/download/v5.1.0/NGSEPcore_5.1.0.jar -O /usr/local/bin/NGSEPcore.jar + +ENTRYPOINT [ "java", "-jar", "/usr/local/bin/NGSEPcore.jar" ] \ No newline at end of file diff --git a/docker/ngsep/package.json b/docker/ngsep/package.json new file mode 100644 index 000000000..19a575139 --- /dev/null +++ b/docker/ngsep/package.json @@ -0,0 +1,5 @@ +{ + "name": "ngsep", + "version": "5.1.0", + "revision": "0" +} diff --git a/test/fixtures/bams/README.md b/test/fixtures/bams/README.md index 5a4b8d0ec..22dd28736 100644 --- a/test/fixtures/bams/README.md +++ b/test/fixtures/bams/README.md @@ -40,6 +40,14 @@ Coordinate sorted BAM with 1 read group (`ID:test`). Aligned using BWA `aln` (us BAM index corresponding to `test.bwa_aln_pe.chrY_chrM.bam`. +## test.bwa_aln_pe.with_variants.chrY_chrM.bam + +Coordinate sorted BAM with 1 read group (`ID:test`). Aligned using BWA `aln` (using the `bwa.bwa_aln_pe` WDL task). 100,000 Paired-End reads mapped to GRCh38 `chrY` and `chrM` from `test_variants_R1.fq.gz` and `test_variants_R2.fq.gz`. + +## test.bwa_aln_pe.with_variants.chrY_chrM.bam.bai + +BAM index corresponding to `test.bwa_aln_pe.with_variants.chrY_chrM.bam`. + ## test.extra_RG.bam A duplicate of `test.bam` with an added `@RG` entry with the ID `no_match`. There are no reads corresponding to the `no_match` RG entry. diff --git a/test/fixtures/bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam b/test/fixtures/bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam new file mode 100644 index 000000000..bc8bc4113 --- /dev/null +++ b/test/fixtures/bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b96e2b6bbced6faf80a8654abcf6cfe9e725c813bbd169c3cc5d63aa3b7fdaa +size 14454382 diff --git a/test/fixtures/bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam.bai b/test/fixtures/bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam.bai new file mode 100644 index 000000000..eda6b8b7d --- /dev/null +++ b/test/fixtures/bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam.bai @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b85c6e5599a5f0e707f9e5dc024e4ab7abf17f437eedd5f3e1016ee3d401a2fa +size 34552 diff --git a/test/fixtures/bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam.csi b/test/fixtures/bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam.csi new file mode 100644 index 000000000..b6d6f97ac Binary files /dev/null and b/test/fixtures/bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam.csi differ diff --git a/test/fixtures/fastqs/README.md b/test/fixtures/fastqs/README.md index 01d3d43ba..a37c3821d 100644 --- a/test/fixtures/fastqs/README.md +++ b/test/fixtures/fastqs/README.md @@ -19,3 +19,11 @@ The following list is sorted alphabetically: ## test_R2.fq.gz 10,000 reads in FASTQ format. Can be used by itself to represent a Single-End sample or used with `test_R1.fq.gz` to represent a Paired-End sample. Generated with `fq generate`. + +## test_variants_R1.fq.gz + +10,000 reads in FASTQ format. Can be used by itself to represent a Single-End sample or used with `test_variants.R2.fq.gz` to represent a Paired-End sample. Generated with `fq generate` on `chrY` and `chrM`. Simulated from a reference sequence containing variants. + +## test_variants_R2.fq.gz + +10,000 reads in FASTQ format. Can be used by itself to represent a Single-End sample or used with `test_variants.R1.fq.gz` to represent a Paired-End sample. Generated with `fq generate` on `chrY` and `chrM`. Simulated from a reference sequence containing variants. \ No newline at end of file diff --git a/test/fixtures/fastqs/test_variants_R1.fq.gz b/test/fixtures/fastqs/test_variants_R1.fq.gz new file mode 100644 index 000000000..8773c9323 --- /dev/null +++ b/test/fixtures/fastqs/test_variants_R1.fq.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b4c779b6b85872895b74fcd4a7322e903f70160ac0d859ebc55b4286260d412 +size 5836265 diff --git a/test/fixtures/fastqs/test_variants_R2.fq.gz b/test/fixtures/fastqs/test_variants_R2.fq.gz new file mode 100644 index 000000000..210a9ed1f --- /dev/null +++ b/test/fixtures/fastqs/test_variants_R2.fq.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60cacae27425f1a844731c12a35c8b836b58f1b80c0addd2ce836c07476f2db1 +size 5835338 diff --git a/test/fixtures/mutect2/README.md b/test/fixtures/mutect2/README.md new file mode 100644 index 000000000..5dc946753 --- /dev/null +++ b/test/fixtures/mutect2/README.md @@ -0,0 +1,29 @@ +# Mutect2 fixtures + +Inputs and intermediate artifacts used by the per-task tests in `tools/test/mutect2.yaml` for `tools/mutect2.wdl`. Files are downsampled to `chrY` and `chrM` to match the reference at `test/fixtures/reference/GRCh38.chrY_chrM.fa` and the BAMs at `test/fixtures/bams/test.bwa_aln_pe*.chrY_chrM.bam`. + +The following list is sorted alphabetically: + +## af-only-gnomad.hg38.chrY_chrM.vcf.gz + +GATK best-practices germline resource (`af-only-gnomad.hg38.vcf.gz` from the GATK resource bundle), subset to `chrY` and `chrM`. Used as the `germline_resource_vcf` input for the `mutect2` task and as both `intervals` and `variants` inputs for the `get_pileup_summaries` task. + +## af-only-gnomad.hg38.chrY_chrM.vcf.gz.tbi + +Tabix index for `af-only-gnomad.hg38.chrY_chrM.vcf.gz`. + +## test.bwa_aln_pe.chrY_chrM_pileup_summaries.table + +Output of `gatk GetPileupSummaries` run on `test/fixtures/bams/test.bwa_aln_pe.chrY_chrM.bam` (normal sample) over the sites in `af-only-gnomad.hg38.chrY_chrM.vcf.gz`. Used as the `normal_pileups` input for the `calculate_contamination` task. + +## test.bwa_aln_pe.with_variants.chrY_chrM_pileup_summaries.contamination.table + +Output of `gatk CalculateContamination` run on the tumor and normal pileup tables (matched-normal mode). Used as the `contamination_table` input for the `filter_mutect` task. + +## test.bwa_aln_pe.with_variants.chrY_chrM_pileup_summaries.segments.table + +Tumor segmentation output of the same `gatk CalculateContamination` invocation that produced the contamination table. Used as the `maf_segments` input for the `filter_mutect` task. + +## test.bwa_aln_pe.with_variants.chrY_chrM_pileup_summaries.table + +Output of `gatk GetPileupSummaries` run on `test/fixtures/bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam` (tumor sample) over the sites in `af-only-gnomad.hg38.chrY_chrM.vcf.gz`. Used as the `tumor_pileups` input for the `calculate_contamination` task. diff --git a/test/fixtures/mutect2/af-only-gnomad.hg38.chrY_chrM.vcf.gz b/test/fixtures/mutect2/af-only-gnomad.hg38.chrY_chrM.vcf.gz new file mode 100644 index 000000000..6cf198e4f --- /dev/null +++ b/test/fixtures/mutect2/af-only-gnomad.hg38.chrY_chrM.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fd569f63658375cd96ee99a6f7fd8dc4dfb635e93e3ebc6d69cf20b36bbdf64 +size 2892 diff --git a/test/fixtures/mutect2/af-only-gnomad.hg38.chrY_chrM.vcf.gz.tbi b/test/fixtures/mutect2/af-only-gnomad.hg38.chrY_chrM.vcf.gz.tbi new file mode 100644 index 000000000..a905a522f Binary files /dev/null and b/test/fixtures/mutect2/af-only-gnomad.hg38.chrY_chrM.vcf.gz.tbi differ diff --git a/test/fixtures/mutect2/test.bwa_aln_pe.chrY_chrM_pileup_summaries.table b/test/fixtures/mutect2/test.bwa_aln_pe.chrY_chrM_pileup_summaries.table new file mode 100644 index 000000000..5673be665 --- /dev/null +++ b/test/fixtures/mutect2/test.bwa_aln_pe.chrY_chrM_pileup_summaries.table @@ -0,0 +1,2 @@ +#SAMPLE=test +contig position ref_count alt_count other_alt_count allele_frequency diff --git a/test/fixtures/mutect2/test.bwa_aln_pe.with_variants.chrY_chrM_pileup_summaries.contamination.table b/test/fixtures/mutect2/test.bwa_aln_pe.with_variants.chrY_chrM_pileup_summaries.contamination.table new file mode 100644 index 000000000..5fde5fa4c --- /dev/null +++ b/test/fixtures/mutect2/test.bwa_aln_pe.with_variants.chrY_chrM_pileup_summaries.contamination.table @@ -0,0 +1,2 @@ +sample contamination error +test 0.0 1.0 diff --git a/test/fixtures/mutect2/test.bwa_aln_pe.with_variants.chrY_chrM_pileup_summaries.segments.table b/test/fixtures/mutect2/test.bwa_aln_pe.with_variants.chrY_chrM_pileup_summaries.segments.table new file mode 100644 index 000000000..a49f88276 --- /dev/null +++ b/test/fixtures/mutect2/test.bwa_aln_pe.with_variants.chrY_chrM_pileup_summaries.segments.table @@ -0,0 +1,2 @@ +#SAMPLE=test +contig start end minor_allele_fraction diff --git a/test/fixtures/mutect2/test.bwa_aln_pe.with_variants.chrY_chrM_pileup_summaries.table b/test/fixtures/mutect2/test.bwa_aln_pe.with_variants.chrY_chrM_pileup_summaries.table new file mode 100644 index 000000000..5673be665 --- /dev/null +++ b/test/fixtures/mutect2/test.bwa_aln_pe.with_variants.chrY_chrM_pileup_summaries.table @@ -0,0 +1,2 @@ +#SAMPLE=test +contig position ref_count alt_count other_alt_count allele_frequency diff --git a/test/fixtures/reference/GRCh38.chrY_chrM.bwamem2_db.tar.gz b/test/fixtures/reference/GRCh38.chrY_chrM.bwamem2_db.tar.gz new file mode 100644 index 000000000..77b0aff80 --- /dev/null +++ b/test/fixtures/reference/GRCh38.chrY_chrM.bwamem2_db.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8642965d4fbf6d62d9297bdcf0421a173f64ab5c2e85a44a1ebe03a4c3b1fbad +size 138219634 diff --git a/test/fixtures/reference/GRCh38.chrY_chrM.dist b/test/fixtures/reference/GRCh38.chrY_chrM.dist new file mode 100644 index 000000000..b71a42193 --- /dev/null +++ b/test/fixtures/reference/GRCh38.chrY_chrM.dist @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc64ba00b1b2678d30ebb3e132013ec0f4bdf524180a6800b511b76b53bee0a7 +size 25291032 diff --git a/test/fixtures/reference/GRCh38.chrY_chrM.giraffe.gbz b/test/fixtures/reference/GRCh38.chrY_chrM.giraffe.gbz new file mode 100644 index 000000000..8a51119cd --- /dev/null +++ b/test/fixtures/reference/GRCh38.chrY_chrM.giraffe.gbz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:becb0f3e33b88d65cca4388dea6e1dd9390a18ea9060f8455554ab027d71c4aa +size 48543456 diff --git a/test/fixtures/reference/GRCh38.chrY_chrM.mmi b/test/fixtures/reference/GRCh38.chrY_chrM.mmi new file mode 100644 index 000000000..bb8e7b3a2 --- /dev/null +++ b/test/fixtures/reference/GRCh38.chrY_chrM.mmi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12178a52129f6a7482b6b1af1e622fc373e48a6e0b79335fa4be2e0f9774216a +size 90529898 diff --git a/test/fixtures/reference/GRCh38.chrY_chrM.shortread.withzip.min b/test/fixtures/reference/GRCh38.chrY_chrM.shortread.withzip.min new file mode 100644 index 000000000..18a76f82b --- /dev/null +++ b/test/fixtures/reference/GRCh38.chrY_chrM.shortread.withzip.min @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:062e93650648e0e1cce77c97e56acb721e68d070d1089d34801a745f218dafa1 +size 174418792 diff --git a/test/fixtures/reference/GRCh38.chrY_chrM.shortread.zipcodes b/test/fixtures/reference/GRCh38.chrY_chrM.shortread.zipcodes new file mode 100644 index 000000000..f18baa9dc --- /dev/null +++ b/test/fixtures/reference/GRCh38.chrY_chrM.shortread.zipcodes @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9fbe6a93a2cb96c8c2b69c0da5bb34a824750aadc75c7ca25b402052780ca5f +size 8 diff --git a/test/fixtures/reference/Homo_sapiens_assembly38.dbsnp138.top5000.vcf.gz b/test/fixtures/reference/Homo_sapiens_assembly38.dbsnp138.top5000.vcf.gz new file mode 100644 index 000000000..3071a47bd --- /dev/null +++ b/test/fixtures/reference/Homo_sapiens_assembly38.dbsnp138.top5000.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cabc413c761adf0bf7a1c06dd40d76596815d304c752b86ff734e7a1bad16a35 +size 54714 diff --git a/test/fixtures/reference/Homo_sapiens_assembly38.dbsnp138.top5000.vcf.gz.tbi b/test/fixtures/reference/Homo_sapiens_assembly38.dbsnp138.top5000.vcf.gz.tbi new file mode 100644 index 000000000..2e2bf5583 Binary files /dev/null and b/test/fixtures/reference/Homo_sapiens_assembly38.dbsnp138.top5000.vcf.gz.tbi differ diff --git a/test/fixtures/vcfs/README.md b/test/fixtures/vcfs/README.md index 536be0a95..45e20db19 100644 --- a/test/fixtures/vcfs/README.md +++ b/test/fixtures/vcfs/README.md @@ -4,6 +4,18 @@ All the files in this directory were either randomly generated or sourced from p The following list is sorted alphabetically: +## test.bwa_aln_pe.chrY_chrM.mutect2.vcf.gz + +Unfiltered Mutect2 somatic VCF produced by running `gatk Mutect2` with `test/fixtures/bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam` as the tumor sample and `test/fixtures/bams/test.bwa_aln_pe.chrY_chrM.bam` as the matched normal, against `test/fixtures/reference/GRCh38.chrY_chrM.fa`. Used as the `unfiltered_somatic_vcf` input for the `filter_mutect` task in `tools/mutect2.wdl`. + +## test.bwa_aln_pe.chrY_chrM.mutect2.vcf.gz.tbi + +Tabix index for `test.bwa_aln_pe.chrY_chrM.mutect2.vcf.gz`. + +## test.bwa_aln_pe.chrY_chrM.mutect2.vcf.gz.stats + +Mutect2 stats sidecar emitted alongside `test.bwa_aln_pe.chrY_chrM.mutect2.vcf.gz`. Used as the `unfiltered_somatic_vcf_stats` input for the `filter_mutect` task. + ## test1.vcf.gz Single position VCF file @@ -19,3 +31,11 @@ Single position VCF file ## test2.vcf.gz.tbi Index file for `test2.vcf.gz`. + +## testY.vcf.gz + +Single position VCF file on `chrY` + +## testY.vcf.gz.tbi + +Index file for `testY.vcf.gz`. \ No newline at end of file diff --git a/test/fixtures/vcfs/test.bwa_aln_pe.chrY_chrM.mutect2.vcf.gz b/test/fixtures/vcfs/test.bwa_aln_pe.chrY_chrM.mutect2.vcf.gz new file mode 100644 index 000000000..a83f3e1e2 --- /dev/null +++ b/test/fixtures/vcfs/test.bwa_aln_pe.chrY_chrM.mutect2.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9699d5bddac8b5b213ac2a6085fe7dfcbc36330d63e7a3629793dd4c160f5f8c +size 4324 diff --git a/test/fixtures/vcfs/test.bwa_aln_pe.chrY_chrM.mutect2.vcf.gz.stats b/test/fixtures/vcfs/test.bwa_aln_pe.chrY_chrM.mutect2.vcf.gz.stats new file mode 100644 index 000000000..a411aa130 --- /dev/null +++ b/test/fixtures/vcfs/test.bwa_aln_pe.chrY_chrM.mutect2.vcf.gz.stats @@ -0,0 +1,2 @@ +statistic value +callable 157175.0 diff --git a/test/fixtures/vcfs/test.bwa_aln_pe.chrY_chrM.mutect2.vcf.gz.tbi b/test/fixtures/vcfs/test.bwa_aln_pe.chrY_chrM.mutect2.vcf.gz.tbi new file mode 100644 index 000000000..8ab61e09d Binary files /dev/null and b/test/fixtures/vcfs/test.bwa_aln_pe.chrY_chrM.mutect2.vcf.gz.tbi differ diff --git a/test/fixtures/vcfs/testY.vcf.gz b/test/fixtures/vcfs/testY.vcf.gz new file mode 100644 index 000000000..139818f4f --- /dev/null +++ b/test/fixtures/vcfs/testY.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dc460c4815104a438cbbf39102559d1d01e1c8cf68f6e2c01755ff5f918f0f6 +size 695 diff --git a/test/fixtures/vcfs/testY.vcf.gz.tbi b/test/fixtures/vcfs/testY.vcf.gz.tbi new file mode 100644 index 000000000..aecb937ab Binary files /dev/null and b/test/fixtures/vcfs/testY.vcf.gz.tbi differ diff --git a/tools/CHANGELOG.md b/tools/CHANGELOG.md index 045c1e109..92923bee8 100644 --- a/tools/CHANGELOG.md +++ b/tools/CHANGELOG.md @@ -4,6 +4,39 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/). +## 2026 May + +### Added + +- New `bwamem2.wdl` tool wrapper for the BWA-MEM2 aligner [#282](https://github.com/stjudecloud/workflows/pull/282) +- New `clair.wdl` tool wrapper for the Clair3 variant caller [#282](https://github.com/stjudecloud/workflows/pull/282) +- New `deepvariant.wdl` tool wrapper for Google DeepVariant [#282](https://github.com/stjudecloud/workflows/pull/282) +- New `hisat2.wdl` tool wrapper for HISAT2 (tasks marked deprecated on introduction) [#282](https://github.com/stjudecloud/workflows/pull/282) +- New `manta.wdl` tool wrapper for the Manta structural variant caller [#282](https://github.com/stjudecloud/workflows/pull/282) +- New `minimap2.wdl` tool wrapper for minimap2 [#282](https://github.com/stjudecloud/workflows/pull/282) +- New `mutect2.wdl` tool wrapper for the GATK Mutect2 somatic variant calling workflow [#282](https://github.com/stjudecloud/workflows/pull/282) +- New `ngsep.wdl` tool wrapper for NGSEP germline variant calling [#282](https://github.com/stjudecloud/workflows/pull/282) +- New `strelka.wdl` tool wrapper for the Strelka2 germline and somatic variant callers [#282](https://github.com/stjudecloud/workflows/pull/282) +- New `vg.wdl` tool wrapper for the `vg` variant graph aligner [#282](https://github.com/stjudecloud/workflows/pull/282) +- New `gatk4` tasks `apply_vqsr`, `variant_recalibrator`, `calculate_genotype_posteriors`, `genotype_gvcfs`, and `resource_to_string`, plus `Resource` struct and `ref_confidence`/`VariantMode` enums [#282](https://github.com/stjudecloud/workflows/pull/282) +- New `samtools.calmd` and `samtools.sort` tasks [#282](https://github.com/stjudecloud/workflows/pull/282) + +### Changed + +- `gatk4.wdl` bumped from WDL 1.1 to WDL 1.3 [#282](https://github.com/stjudecloud/workflows/pull/282) +- Raised default disk allocations across most `tools/*.wdl` tasks (typically +10 → +30 GB) to accommodate larger inputs [#282](https://github.com/stjudecloud/workflows/pull/282) +- `bwa.bwa_mem` memory raised from 25 GB to 120 GB; `samtools_cores` calculation now floors at 1 for low-CPU runs (applies to all three `bwa.bwa_*` tasks) [#282](https://github.com/stjudecloud/workflows/pull/282) +- `samtools.addreplacerg` memory raised from 4 GB to 8 GB; default disk allocation raised by 40 GB [#282](https://github.com/stjudecloud/workflows/pull/282) +- `picard.mark_duplicates` and `picard.sort` now use a task-local `tmp/` directory for Java temp files [#282](https://github.com/stjudecloud/workflows/pull/282) +- `picard.sort` default memory raised from 25 GB to 35 GB [#282](https://github.com/stjudecloud/workflows/pull/282) +- `picard.validate_bam` default disk allocation now scales with BAM size (`bam_size * 4 + 50`) instead of `bam_size + 10` [#282](https://github.com/stjudecloud/workflows/pull/282) +- `arriba.arriba_extract_fusion_supporting_alignments` now localizes BAM and BAI into the task working directory before invocation [#282](https://github.com/stjudecloud/workflows/pull/282) +- `util.make_coverage_regions_bed` now explicitly requests 8 GB of memory [#282](https://github.com/stjudecloud/workflows/pull/282) + +### Fixed + +- `picard.create_sequence_dictionary` now correctly derives the default `outfile_name` for FASTA inputs with `.fasta`, `.fna`, or `.gz`-compressed extensions (previously only `.fa` was stripped, producing names like `genome.fa.gz.dict`) [#282](https://github.com/stjudecloud/workflows/pull/282) + ## 2026 February ### Changed diff --git a/tools/arriba.wdl b/tools/arriba.wdl index e5a0506bb..225da99f0 100644 --- a/tools/arriba.wdl +++ b/tools/arriba.wdl @@ -359,10 +359,18 @@ task arriba_extract_fusion_supporting_alignments { Int disk_size_gb = ceil(input_size_gb) + 5 + modify_disk_size_gb command <<< + set -euo pipefail + + bam_name=~{basename(bam)} + ln -sf "~{bam}" "$bam_name" + ln -sf "~{bam_index}" "$bam_name.bai" + extract_fusion-supporting_alignments.sh \ "~{fusions}" \ - "~{bam}" \ + "$bam_name" \ "~{prefix}" + + rm "$bam_name" "$bam_name.bai" >>> output { diff --git a/tools/bwa.wdl b/tools/bwa.wdl index a27cb71a0..70598ed1b 100644 --- a/tools/bwa.wdl +++ b/tools/bwa.wdl @@ -58,7 +58,7 @@ task bwa_aln { n_cores=$(nproc) fi # -1 because samtools uses one more core than `--threads` specifies - (( samtools_cores = n_cores - 1 )) + (( samtools_cores = n_cores - 1 || 1 )) mkdir bwa_db tar -C bwa_db -xzf "~{bwa_db_tar_gz}" --no-same-owner @@ -151,7 +151,7 @@ task bwa_aln_pe { n_cores=$(nproc) fi # -1 because samtools uses one more core than `--threads` specifies - (( samtools_cores = n_cores - 1 )) + (( samtools_cores = n_cores - 1 || 1 )) mkdir bwa_db tar -C bwa_db -xzf "~{bwa_db_tar_gz}" --no-same-owner @@ -236,14 +236,14 @@ task bwa_mem { ) command <<< - set -euo pipefail + set -xeuo pipefail n_cores=~{ncpu} if ~{use_all_cores}; then n_cores=$(nproc) fi # -1 because samtools uses one more core than `--threads` specifies - (( samtools_cores = n_cores - 1 )) + (( samtools_cores = n_cores - 1 || 1 )) mkdir bwa_db tar -C bwa_db -xzf "~{bwa_db_tar_gz}" --no-same-owner @@ -282,7 +282,7 @@ task bwa_mem { runtime { cpu: ncpu - memory: "25 GB" + memory: "120 GB" disks: "~{disk_size_gb} GB" container: "ghcr.io/stjudecloud/bwa:0.7.17-2" maxRetries: 1 diff --git a/tools/bwamem2.wdl b/tools/bwamem2.wdl new file mode 100644 index 000000000..8e1b39c0e --- /dev/null +++ b/tools/bwamem2.wdl @@ -0,0 +1,136 @@ +version 1.2 + +task align { + meta { + description: "Align DNA sequences against a large reference database using BWA-MEM2" + outputs: { + alignments: "The output alignment file in SAM format", + } + } + + parameter_meta { + read_one_fastq_gz: "Input gzipped FASTQ read one file to align with BWA-MEM2" + reference_index: "The BWA-MEM2 index file for the reference genome" + read_group: "The read group string to be included in the SAM header. Format: '@RG\\tID:foo\\tSM:bar'" + read_two_fastq_gz: "Input gzipped FASTQ read two file to align with BWA-MEM2" + prefix: "Prefix for the BAM file. The extension `.bam` will be added." + smart_pairing: "If true, enable smart pairing mode for paired-end reads" + skip_mate_rescue: "If true, skip mate rescue for paired-end reads" + threads: "Number of threads to use for alignment" + modify_disk_size_gb: "Additional disk space to allocate (in GB)" + seed_length: "Seed value for the BWA-MEM2 aligner" + min_score: "Minimum score threshold for reporting alignments" + } + + input { + File read_one_fastq_gz + File reference_index + String read_group + File? read_two_fastq_gz + String prefix = sub(basename(read_one_fastq_gz), "([_\\.][rR][12])?(\\.subsampled)?\\.(fastq|fq)(\\.gz)?$", + "") + Boolean smart_pairing = false + Boolean skip_mate_rescue = false + Int threads = 4 + Int modify_disk_size_gb = 0 + Int seed_length = 19 + Int min_score = 30 + } + + String output_name = prefix + ".bam" + Int disk_size_gb = ceil((size(read_one_fastq_gz, "GB") + size(read_two_fastq_gz, "GB") + ) * 2) + ceil(size(reference_index, "GB")) + 30 + modify_disk_size_gb + + command <<< + set -euo pipefail + + mkdir bwa_db + tar -C bwa_db -xzf "~{reference_index}" --no-same-owner + PREFIX=$(basename bwa_db/*.ann ".ann") + + bwa-mem2 mem \ + -t ~{threads} \ + -R "~{read_group}" \ + -k ~{seed_length} \ + -T ~{min_score} \ + ~{if smart_pairing + then "-p" + else "" + } \ + ~{if skip_mate_rescue + then "-S" + else "" + } \ + bwa_db/"$PREFIX" \ + "~{read_one_fastq_gz}" \ + ~{if defined(read_two_fastq_gz) + then "\"~{read_two_fastq_gz}\"" + else "" + } | + samtools view -b -o "~{output_name}" - + + rm -r bwa_db + >>> + + output { + File alignments = output_name + } + + requirements { + container: "ghcr.io/stjudecloud/bwamem2:branch-minimap2-2.3-0" + cpu: threads + memory: "~{4 * threads} GB" + disks: "~{disk_size_gb} GB" + } +} + +task index { + meta { + description: "Index a reference genome for alignment with minimap2" + outputs: { + reference_index: "The minimap2 index file for the reference genome", + } + } + + parameter_meta { + reference_fasta: "The reference genome in FASTA format to be indexed" + db_name: "The base name for the output index files" + modify_disk_size_gb: "Additional disk space to allocate (in GB)" + } + + input { + File reference_fasta + String db_name = "reference" + Int modify_disk_size_gb = 0 + } + + Float input_fasta_size = size(reference_fasta, "GB") + Int disk_size_gb = ceil(input_fasta_size * 2) + 10 + modify_disk_size_gb + String bwa_db_out_name = db_name + ".tar.gz" + + command <<< + set -euo pipefail + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + + bwa-mem2 index \ + "$ref_fasta" + + tar -czf "~{bwa_db_out_name}" "$ref_fasta"* + + rm -r "$ref_fasta" + >>> + + output { + File reference_index = bwa_db_out_name + } + + requirements { + container: "ghcr.io/stjudecloud/bwamem2:branch-minimap2-2.3-0" + cpu: 1 + memory: "120 GB" + disks: "~{disk_size_gb} GB" + } +} diff --git a/tools/clair.wdl b/tools/clair.wdl new file mode 100644 index 000000000..fbe2ef40b --- /dev/null +++ b/tools/clair.wdl @@ -0,0 +1,310 @@ +version 1.3 + +enum platform { + ont_r10_dorado_sup_4khz, + ont_r10_dorado_sup_5khz, + ont_r10_dorado_hac_5khz, + ont_r10_dorado_hac_4khz, + ont_r10_guppy, + ont_r9_guppy, + ilmn, + hifi_sequel2, + hifi_revio, +} + +task clair3 { + meta { + description: "Run Clair3 variant caller for small variants using deep neural networks" + outputs: { + pileup_vcf: "VCF file with variants called using pileup model", + full_alignment_vcf: "VCF file with variants called using full-alignment model", + merged_vcf: "Final merged VCF file with variants from both models", + } + } + + parameter_meta { + reference_fasta: "Reference genome in FASTA format" + reference_fasta_index: "Index file for the reference genome FASTA" + bam: "Input BAM file with aligned reads" + bam_index: "Index file for the input BAM file" + model: "Pre-trained Clair3 model to use for variant calling" + bed_regions: "Optional BED file specifying regions to call variants in" + vcf_candidates: "Optional VCF file with candidate variants to consider" + contigs: "Optional list of contigs to call variants in. If undefined, all contigs in the reference FASTA will be considered." + output_dir: "Directory to store Clair3 output" + platform: { + description: "Sequencing platform used to generate the reads", + choices: [ + "ont", + "hifi", + "ilmn", + ], + } + all_contigs: "Boolean indicating whether to include all contigs in variant calling. If false only chr{1..22,X,Y} are called." + print_ref_calls: "Boolean indicating whether to print reference calls in the output VCF" + gvcf: "Boolean indicating whether to output gVCF format" + threads: "Number of threads to use" + modify_disk_size_gb: "Additional disk size in GB to allocate" + } + + input { + File reference_fasta + File reference_fasta_index + File bam + File bam_index + String model + File? bed_regions + File? vcf_candidates + Array[String] contigs = [] + String output_dir = "clair3_output" + String platform = "ilmn" + Boolean all_contigs = false + Boolean print_ref_calls = false + Boolean gvcf = false + Int threads = 4 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(reference_fasta, "GB") * 2) + ceil(size(bam, "GB") * 2) + 150 + + modify_disk_size_gb + + String filename = basename(bam) + + #@ except: ShellCheck + command <<< + set -euo pipefail + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + ln -sf "~{reference_fasta_index}" "$ref_fasta.fai" + + # Clair-3 resolves the BAM path... + cp "~{bam}" "~{filename}" + cp "~{bam_index}" "~{filename}.bai" + + run_clair3.sh \ + --bam_fn="~{filename}" \ + --ref_fn="$ref_fasta" \ + --threads="~{threads}" \ + --platform="~{platform}" \ + --model_path="/opt/models/~{model}" \ + --output="~{output_dir}" \ + ~{if length(contigs) > 0 + then "--ctg_name='~{sep(",", contigs)}'" + else "" + } \ + ~{if all_contigs + then "--include_all_ctgs" + else "" + } \ + ~{if print_ref_calls + then "--print_ref_calls" + else "" + } \ + ~{if defined(bed_regions) + then "--bed_fn='~{bed_regions}'" + else "" + } \ + ~{if defined(vcf_candidates) + then "--vcf_fn='~{vcf_candidates}'" + else "" + } \ + ~{if gvcf + then "--gvcf" + else "" + } + + rm -rf "$ref_fasta" "$ref_fasta.fai" "~{filename}" "~{filename}.bai" + >>> + + output { + File pileup_vcf = "~{output_dir}/pileup.vcf.gz" + File full_alignment_vcf = "~{output_dir}/full_alignment.vcf.gz" + File merged_vcf = "~{output_dir}/merge_output.vcf.gz" + } + + requirements { + # container: "quay.io/biocontainers/clair3:1.2.0--py310h779eee5_0" + container: "hkubal/clair3:v2.0.0" + cpu: threads + memory: "64 GB" + disks: "~{disk_size_gb} GB" + } +} + +task clairs { + meta { + description: "Run ClairS paired sample variant caller" + outputs: { + vcf: "VCF file with somatic variants called by ClairS", + vcf_index: "Index for `vcf`", + } + } + + parameter_meta { + tumor_bam: "Input BAM file with aligned reads for tumor sample" + tumor_bam_index: "Index file for the tumor BAM file" + normal_bam: "Input BAM file with aligned reads for normal sample" + normal_bam_index: "Index file for the normal BAM file" + reference_fasta: "Reference genome in FASTA format" + reference_fasta_index: "Index file for the reference genome FASTA" + platform: { + description: "Sequencing platform used to generate the reads", + choices: [ + "ont_r10_dorado_sup_4khz", + "ont_r10_dorado_sup_5khz", + "ont_r10_dorado_hac_5khz", + "ont_r10_dorado_hac_4khz", + "ont_r10_guppy", + "ont_r9_guppy", + "ilmn", + "hifi_sequel2", + "hifi_revio", + ], + } + bed_regions: "Optional BED file specifying regions to call variants in" + vcf_candidates: "Optional VCF file with candidate variants to consider" + pileup_model: "Optional pre-trained ClairS pileup model to use for variant calling" + full_alignment_model: "Optional pre-trained ClairS full-alignment model to use for variant calling" + snv_min_qual: "Minimum quality score required to call a SNV" + indel_min_qual: "Minimum quality score required to call an indel" + contigs: "Optional list of contigs to call variants in. If undefined, all contigs in the reference FASTA will be considered." + prefix: "Prefix for ClairS output files" + sample_name: "Sample name to use in the output VCF" + output_dir: "Directory to store ClairS output" + all_contigs: "Boolean indicating whether to include all contigs in variant calling. If false only chr{1..22,X,Y} are called." + print_ref_calls: "Boolean indicating whether to print reference calls in the output VCF" + print_germline_calls: "Boolean indicating whether to print germline calls in the output VCF" + remove_intermediate_directory: "Boolean indicating whether to remove the intermediate output directory generated by ClairS after the final VCF is produced" + snv_min_af: "Minimum allele frequency required to call a SNV" + threads: "Number of threads to use" + modify_disk_size_gb: "Additional disk size in GB to allocate" + min_coverage: "Minimum coverage required at a site to make a variant call" + chunk_size: "Size of genomic chunks in base pairs to split the input BAMs into for parallel processing" + } + + input { + File tumor_bam + File tumor_bam_index + File normal_bam + File normal_bam_index + File reference_fasta + File reference_fasta_index + platform platform + File? bed_regions + File? vcf_candidates + File? pileup_model + File? full_alignment_model + Int? snv_min_qual + Int? indel_min_qual + Array[String] contigs = [] + String prefix = "output" + String sample_name = "SAMPLE" + String output_dir = "output" + Boolean all_contigs = false + Boolean print_ref_calls = false + Boolean print_germline_calls = false + Boolean remove_intermediate_directory = false + Float snv_min_af = 0.05 + Int threads = 4 + Int modify_disk_size_gb = 0 + Int min_coverage = 4 + Int chunk_size = 5000000 + } + + Int disk_size_gb = ceil(size(reference_fasta, "GB") * 2) + ceil(size(tumor_bam, "GB") + * 2) + ceil(size(normal_bam, "GB") * 2) + 50 + modify_disk_size_gb + + String tumor = basename(tumor_bam) + String normal = basename(normal_bam) + + #@ except: ShellCheck + command <<< + set -euo pipefail + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + ln -sf "~{reference_fasta_index}" "$ref_fasta.fai" + + cp "~{tumor_bam}" "~{tumor}" + cp "~{tumor_bam_index}" "~{tumor}.bai" + cp "~{normal_bam}" "~{normal}" + cp "~{normal_bam_index}" "~{normal}.bai" + + #@except: ShellCheck + run_clairs \ + --tumor_bam_fn "~{tumor}" \ + --normal_bam_fn "~{normal}" \ + --ref_fn "$ref_fasta" \ + --threads "~{threads}" \ + --platform "~{platform}" \ + --output_dir "~{output_dir}" \ + --output_prefix "~{prefix}" \ + --min_coverage "~{min_coverage}" \ + --chunk_size "~{chunk_size}" \ + --snv_min_af "~{snv_min_af}" \ + --sample_name "~{sample_name}" \ + ~{if length(contigs) > 0 + then "--ctg_name='~{sep(",", contigs)}'" + else "" + } \ + ~{if defined(bed_regions) + then "--bed_fn '~{bed_regions}'" + else "" + } \ + ~{if defined(vcf_candidates) + then "--genotyping_mode_vcf_fn '~{vcf_candidates}'" + else "" + } \ + ~{if all_contigs + then "--include_all_ctgs" + else "" + } \ + ~{if print_ref_calls + then "--print_ref_calls" + else "" + } \ + ~{if print_germline_calls + then "--print_germline_calls" + else "" + } \ + ~{if defined(pileup_model) + then "--pileup_model_path '~{pileup_model}'" + else "" + } \ + ~{if defined(full_alignment_model) + then "--full_alignment_model_path '~{full_alignment_model}'" + else "" + } \ + ~{if defined(snv_min_qual) + then "--snv_min_qual '~{snv_min_qual}'" + else "" + } \ + ~{if defined(indel_min_qual) + then "--indel_min_qual '~{indel_min_qual}'" + else "" + } \ + ~{if remove_intermediate_directory + then "--remove_intermediate_directory" + else "" + } + + rm -rf "$ref_fasta" "$ref_fasta.fai" \ + "~{tumor}" "~{tumor}.bai" \ + "~{normal}" "~{normal}.bai" + >>> + + output { + File vcf = "~{output_dir}/~{prefix}.vcf.gz" + File vcf_index = "~{output_dir}/~{prefix}.vcf.gz.tbi" + } + + requirements { + container: "hkubal/clairs:v0.4.4" + cpu: threads + memory: "64 GB" + disks: "~{disk_size_gb} GB" + } +} diff --git a/tools/deepvariant.wdl b/tools/deepvariant.wdl new file mode 100644 index 000000000..95598b9a6 --- /dev/null +++ b/tools/deepvariant.wdl @@ -0,0 +1,271 @@ +version 1.3 + +enum ModelType { + WGS, + WES, + PACBIO, + ONT, + FFPE_WGS, + FFPE_WES, + FFPE_WGS_TUMOR_ONLY, + FFPE_WES_TUMOR_ONLY, + WGS_TUMOR_ONLY, + WES_TUMOR_ONLY, + PACBIO_TUMOR_ONLY, + ONT_TUMOR_ONLY, +} + +task deepsomatic { + meta { + description: "Call variants using DeepSomatic" + outputs: { + vcf_output: "VCF file containing called somatic variants", + vcf_output_index: "Index file for the called somatic variants VCF", + gvcf_output: "gVCF file containing called somatic variants", + gvcf_output_index: "Index file for the called somatic variants gVCF", + runtime_html: "Optional HTML report of runtime metrics", + vcf_stats: "Optional HTML report of VCF statistics", + } + } + + parameter_meta { + reference_fasta: "Reference genome in FASTA format" + reference_fasta_index: "Index file for the reference genome FASTA" + tumor_bam: "Input BAM file with aligned reads for tumor sample" + tumor_bam_index: "Index for tumor BAM file" + normal_bam: "Input BAM file with aligned reads for normal sample" + normal_bam_index: "Index for normal BAM file" + model_type: { + description: "Type of model to use for variant calling", + choices: [ + "WGS", + "WES", + "PACBIO", + "ONT", + "FFPE_WGS", + "FFPE_WES", + "FFPE_WGS_TUMOR_ONLY", + "FFPE_WES_TUMOR_ONLY", + "WGS_TUMOR_ONLY", + "WES_TUMOR_ONLY", + "PACBIO_TUMOR_ONLY", + "ONT_TUMOR_ONLY", + ], + } + output_prefix: "Prefix for output VCF and gVCF files" + tumor_sample_name: "Sample name for the tumor sample" + normal_sample_name: "Sample name for the normal sample" + runtime_report: "Output make_examples_somatic runtime metrics and create a visual runtime report using runtime_by_region_vis." + vcf_stats_report: "Output a visual report (HTML) of statistics about the output VCF." + threads: "Number of threads to use" + modify_disk_size_gb: "Additional disk size in GB to allocate" + } + + input { + File reference_fasta + File reference_fasta_index + File tumor_bam + File tumor_bam_index + File normal_bam + File normal_bam_index + ModelType model_type + String output_prefix = "deepsomatic_output" + String tumor_sample_name = "tumor" + String normal_sample_name = "normal" + Boolean runtime_report = false + Boolean vcf_stats_report = false + Int threads = 8 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(reference_fasta, "GB") * 2) + ceil(size(tumor_bam, "GB") + * 2) + ceil(size(normal_bam, "GB") * 2) + 50 + modify_disk_size_gb + + String tumor = basename(tumor_bam) + String normal = basename(normal_bam) + + command <<< + set -euo pipefail + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + ln -sf "~{reference_fasta_index}" "$ref_fasta.fai" + + ln -s "~{tumor_bam}" "~{tumor}" + ln -s "~{tumor_bam_index}" "~{tumor}.bai" + ln -s "~{normal_bam}" "~{normal}" + ln -s "~{normal_bam_index}" "~{normal}.bai" + + run_deepsomatic \ + --model_type="~{model_type}" \ + --ref="$ref_fasta" \ + --reads_tumor="~{tumor}" \ + --reads_normal="~{normal}" \ + --output_vcf="~{output_prefix}.vcf.gz" \ + --output_gvcf="~{output_prefix}.g.vcf.gz" \ + --sample_name_tumor="~{tumor_sample_name}" \ + --sample_name_normal="~{normal_sample_name}" \ + --num_shards="~{threads}" \ + --logging_dir="logs" \ + --intermediate_results_dir="intermediate_results" \ + ~{if runtime_report + then "--runtime_report" + else "" + } \ + ~{if vcf_stats_report + then "--vcf_stats_report" + else "" + } + + + rm -rf "$ref_fasta" "$ref_fasta.fai" "~{tumor}" "~{tumor}.bai" "~{normal}" "~{ + normal + }.bai" + >>> + + output { + File vcf_output = "~{output_prefix}.vcf.gz" + File vcf_output_index = "~{output_prefix}.vcf.gz.tbi" + File gvcf_output = "~{output_prefix}.g.vcf.gz" + File gvcf_output_index = "~{output_prefix}.g.vcf.gz.tbi" + File? runtime_html = "logs/make_examples_runtime_by_region_report.html" + File? vcf_stats = "~{output_prefix}.visual_report.html" + } + + requirements { + container: "google/deepsomatic:1.10.0-gpu" + #container: "google/deepsomatic:1.10.0" + cpu: threads + memory: "64 GB" + disks: "~{disk_size_gb} GB" + gpu: true + } + + hints { + gpu: 1 + } +} + +task deepvariant { + meta { + description: "Call variants using DeepVariant" + outputs: { + vcf_output: "VCF file containing called variants", + vcf_output_index: "Index file for the called variants VCF", + gvcf_output: "gVCF file containing called variants", + gvcf_output_index: "Index file for the called variants gVCF", + runtime_html: "Optional HTML report of runtime metrics", + vcf_stats: "Optional HTML report of VCF statistics", + } + } + + parameter_meta { + reference_fasta: "Reference genome in FASTA format" + reference_fasta_index: "Index file for the reference genome FASTA" + bam: "Input BAM file with aligned reads for sample" + bam_index: "Index for input BAM file" + haploid_chromosomes: "List of chromosomes to be treated as haploid during variant calling" + output_prefix: "Prefix for output VCF and gVCF files" + model_type: { + description: "Type of model to use for variant calling", + choices: [ + "WGS", + "WES", + "PACBIO", + "ONT", + "FFPE_WGS", + "FFPE_WES", + "FFPE_WGS_TUMOR_ONLY", + "FFPE_WES_TUMOR_ONLY", + "WGS_TUMOR_ONLY", + "WES_TUMOR_ONLY", + "PACBIO_TUMOR_ONLY", + "ONT_TUMOR_ONLY", + ], + } + runtime_report: "Output make_examples_somatic runtime metrics and create a visual runtime report using runtime_by_region_vis." + vcf_stats_report: "Output a visual report (HTML) of statistics about the output VCF." + threads: "Number of threads to use" + modify_disk_size_gb: "Additional disk size in GB to allocate" + } + + input { + File reference_fasta + File reference_fasta_index + File bam + File bam_index + Array[String] haploid_chromosomes = [ + "chrX", + "chrY", + ] + String output_prefix = "deepsomatic_output" + String model_type = "WGS" + Boolean runtime_report = false + Boolean vcf_stats_report = false + Int threads = 8 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(reference_fasta, "GB") * 2) + ceil(size(bam, "GB")) + 50 + + modify_disk_size_gb + + String filename = basename(bam) + + command <<< + set -euo pipefail + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + ln -sf "~{reference_fasta_index}" "$ref_fasta.fai" + + ln -s "~{bam}" "~{filename}" + ln -s "~{bam_index}" "~{filename}.bai" + + mkdir "test" + TMPDIR="test" run_deepvariant \ + --model_type="~{model_type}" \ + --ref="$ref_fasta" \ + --reads="~{filename}" \ + --output_vcf="~{output_prefix}.vcf.gz" \ + --output_gvcf="~{output_prefix}.g.vcf.gz" \ + --num_shards="~{threads}" \ + --logging_dir="logs" \ + --intermediate_results_dir="intermediate_results" \ + --test_tmpdir="test" \ + ~{if runtime_report + then "--runtime_report" + else "" + } \ + ~{if vcf_stats_report + then "--vcf_stats_report" + else "" + } \ + --haploid_contigs="~{sep(",", haploid_chromosomes)}" + + rm -rf "$ref_fasta" "$ref_fasta.fai" "~{filename}" "~{filename}.bai" + >>> + + output { + File vcf_output = "~{output_prefix}.vcf.gz" + File vcf_output_index = "~{output_prefix}.vcf.gz.tbi" + File gvcf_output = "~{output_prefix}.g.vcf.gz" + File gvcf_output_index = "~{output_prefix}.g.vcf.gz.tbi" + File? runtime_html = "logs/runtime_by_region_vis.html" + File? vcf_stats = "logs/vcf_stats_report.html" + } + + requirements { + container: "google/deepvariant:1.10.0-gpu" + #container: "google/deepvariant:1.10.0" + cpu: threads + memory: "64 GB" + disks: "~{disk_size_gb} GB" + gpu: true + } + + hints { + gpu: 1 + } +} diff --git a/tools/gatk4.wdl b/tools/gatk4.wdl index dfc858605..b3adb16d9 100644 --- a/tools/gatk4.wdl +++ b/tools/gatk4.wdl @@ -1,5 +1,77 @@ -## [Homepage](https://software.broadinstitute.org/gatk) -version 1.1 +version 1.3 + +enum ref_confidence { + NONE, + GVCF, + BP_RESOLUTION, +} + +enum VariantMode { + SNP, + INDEL, + BOTH, +} + +struct Resource { + meta { + description: "Struct representing a resource to be used in GATK VariantRecalibrator." + help: "This includes both the VCF file for the resource and metadata about how to use the resource in building the recalibration model." + } + + parameter_meta { + name: "Name of the resource. This is an arbitrary string that is used to identify the resource in the recalibration model." + known: "Boolean indicating whether the resource is a known set of variants." + training: "Boolean indicating whether the resource is a training set of variants to be used for building the recalibration model." + truth: "Boolean indicating whether the resource is a truth set of variants to be used for building the recalibration model." + prior: "Prior probability for the resource." + vcf: "VCF file for the resource." + vcf_index: "Index file for the VCF." + } + + String name + Boolean known + Boolean training + Boolean truth + Float prior + File vcf + File vcf_index +} + +task resource_to_string { + meta { + description: "Converts a resource struct to a string representation for use in GATK commands." + outputs: { + res_string: "String representation of the input resource struct formatted for GATK VariantRecalibrator.", + res_vcf: "VCF file from the input resource struct", + res_vcf_index: "Index file for the VCF from the input resource struct", + } + } + + parameter_meta { + res: "Resource struct containing information about a resource to be used in GATK VariantRecalibrator." + } + + input { + Resource res + } + + command <<< + echo "~{res.name},known=~{res.known},training=~{res.training},truth=~{res.truth},prior=~{ + res.prior + } ~{basename(res.vcf)}" + >>> + + output { + String res_string = read_string(stdout()) + File res_vcf = res.vcf + File res_vcf_index = res.vcf_index + } + + requirements { + container: "ghcr.io/stjudecloud/util:3.0.3" + maxRetries: 1 + } +} task split_n_cigar_reads { meta { @@ -42,16 +114,27 @@ task split_n_cigar_reads { command <<< set -euo pipefail + bam_name=~{basename(bam)} + ln -sf "~{bam}" "$bam_name" + ln -sf "~{bam_index}" "$bam_name.bai" + + ref_fasta=~{sub(basename(fasta, ".gz"), ".(fasta|fa)?$", "")} + gunzip -c "~{fasta}" > "$ref_fasta.fa" \ + || ln -sf "~{fasta}" "$ref_fasta.fa" + ln -sf "~{fasta_index}" "$ref_fasta.fa.fai" + ln -sf "~{dict}" "$ref_fasta.dict" + gatk \ --java-options "-Xms4000m -Xmx~{java_heap_size}g" \ SplitNCigarReads \ -R "~{fasta}" \ - -I "~{bam}" \ + -I "$bam_name" \ -O "~{prefix}.bam" \ -OBM true # GATK is unreasonable and uses the plain ".bai" suffix. mv "~{prefix}.bai" "~{prefix}.bam.bai" + rm "$bam_name" "$bam_name.bai" "$ref_fasta.fa" "$ref_fasta.fa.fai" "$ref_fasta.dict" >>> output { @@ -60,11 +143,11 @@ task split_n_cigar_reads { File split_n_reads_bam_md5 = "~{prefix}.bam.md5" } - runtime { + requirements { cpu: ncpu memory: "~{memory_gb} GB" disks: "~{disk_size_gb} GB" - container: "quay.io/biocontainers/gatk4:4.4.0.0--py36hdfd78af_0" + container: "quay.io/biocontainers/gatk4:4.6.2.0--py310hdfd78af_1" maxRetries: 1 } } @@ -103,13 +186,14 @@ task base_recalibrator { File dict #@ except: SnakeCase File dbSNP_vcf - #@ except: SnakeCase + #@ except: SnakeCase, UnusedInput File dbSNP_vcf_index Array[File] known_indels_sites_vcfs + #@ except: UnusedInput Array[File] known_indels_sites_indices String outfile_name = basename(bam, ".bam") + ".recal.txt" Boolean use_original_quality_scores = false - Int memory_gb = 25 + Int memory_gb = 50 Int modify_disk_size_gb = 0 Int ncpu = 4 } @@ -118,13 +202,25 @@ task base_recalibrator { Int java_heap_size = ceil(memory_gb * 0.9) command <<< + set -euo pipefail + + bam_name=~{basename(bam)} + ln -sf "~{bam}" "$bam_name" + ln -sf "~{bam_index}" "$bam_name.bai" + + ref_fasta=~{sub(basename(fasta, ".gz"), ".(fasta|fa)?$", "")} + gunzip -c "~{fasta}" > "$ref_fasta.fa" \ + || ln -sf "~{fasta}" "$ref_fasta.fa" + ln -sf "~{fasta_index}" "$ref_fasta.fa.fai" + ln -sf "~{dict}" "$ref_fasta.dict" + # shellcheck disable=SC2102 gatk \ --java-options \ - "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms4000m -Xmx~{java_heap_size}g" \ + "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms4000m -Xmx~{java_heap_size}g -XX:-UseContainerSupport" \ BaseRecalibratorSpark \ - -R "~{fasta}" \ - -I "~{bam}" \ + -R "$ref_fasta.fa" \ + -I "$bam_name" \ ~{if use_original_quality_scores then "--use-original-qualities" else "" @@ -133,13 +229,15 @@ task base_recalibrator { --known-sites "~{dbSNP_vcf}" \ ~{sep(" ", prefix("--known-sites ", squote(known_indels_sites_vcfs)))} \ --spark-master local[~{ncpu}] + + rm -rf "$bam_name" "$bam_name.bai" "$ref_fasta.fa" "$ref_fasta.fa.fai" "$ref_fasta.dict" >>> output { File recalibration_report = "~{outfile_name}" } - runtime { + requirements { cpu: ncpu memory: "~{memory_gb} GB" disks: "~{disk_size_gb} GB" @@ -175,7 +273,7 @@ task apply_bqsr { File recalibration_report String prefix = basename(bam, ".bam") Boolean use_original_quality_scores = false - Int memory_gb = 25 + Int memory_gb = 50 Int modify_disk_size_gb = 0 Int ncpu = 4 } @@ -186,19 +284,25 @@ task apply_bqsr { command <<< set -euo pipefail + bam_name=~{basename(bam)} + ln -sf "~{bam}" "$bam_name" + ln -sf "~{bam_index}" "$bam_name.bai" + # shellcheck disable=SC2102 gatk \ --java-options \ - "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms3000m -Xmx~{java_heap_size}g" \ + "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms3000m -Xmx~{java_heap_size}g -XX:-UseContainerSupport" \ ApplyBQSRSpark \ --spark-master local[~{ncpu}] \ - -I "~{bam}" \ + -I "$bam_name" \ ~{if use_original_quality_scores then "--use-original-qualities" else "" } \ -O "~{prefix}.bqsr.bam" \ --bqsr-recal-file "~{recalibration_report}" + + rm "$bam_name" "$bam_name.bai" >>> output { @@ -206,7 +310,7 @@ task apply_bqsr { File recalibrated_bam_index = "~{prefix}.bqsr.bam.bai" } - runtime { + requirements { cpu: ncpu memory: "~{memory_gb} GB" disks: "~{disk_size_gb} GB" @@ -237,6 +341,11 @@ task haplotype_caller { dict: "Dictionary file for FASTA format genome" dbSNP_vcf: "dbSNP VCF file" dbSNP_vcf_index: "dbSNP VCF index file" + reference_confidence: { + description: "Reference confidence mode to run HaplotypeCaller in.", + help: "If `NONE`, HaplotypeCaller will run in default mode and only output variant sites. If `GVCF`, HaplotypeCaller will run in GVCF mode and output both variant and non-variant sites with reference confidence scores. If `BP_RESOLUTION`, HaplotypeCaller will run in GVCF mode but output non-variant sites at base pair resolution instead of block resolution.", + external_help: "https://gatk.broadinstitute.org/hc/en-us/articles/360037225632-HaplotypeCaller#--emit-ref-confidence", + } prefix: "Prefix for the output VCF. The extension `.vcf.gz` will be added." use_soft_clipped_bases: "Use soft clipped bases in variant calling. Default is to ignore soft clipped bases." stand_call_conf: { @@ -259,6 +368,7 @@ task haplotype_caller { File dbSNP_vcf #@ except: SnakeCase File dbSNP_vcf_index + ref_confidence reference_confidence = ref_confidence.NONE String prefix = basename(bam, ".bam") Boolean use_soft_clipped_bases = false Int stand_call_conf = 20 @@ -270,13 +380,31 @@ task haplotype_caller { Int disk_size_gb = ceil(size(bam, "GB") * 2) + 30 + ceil(size(fasta, "GB")) + modify_disk_size_gb Int java_heap_size = ceil(memory_gb * 0.9) + String sample = basename(bam) + String snp_vcf = basename(dbSNP_vcf) + String snp_vcf_index = basename(dbSNP_vcf_index) + command <<< + set -euo pipefail + + ref_fasta=~{sub(basename(fasta, ".gz"), ".(fasta|fa)?$", "")} + gunzip -c "~{fasta}" > "$ref_fasta.fa" \ + || ln -sf "~{fasta}" "$ref_fasta.fa" + ln -sf "~{fasta_index}" "$ref_fasta.fa.fai" + ln -sf "~{dict}" "$ref_fasta.dict" + + ln -sf "~{bam}" "~{sample}" + ln -sf "~{bam_index}" "~{sample}.bai" + + ln -sf "~{dbSNP_vcf}" "~{snp_vcf}" + ln -sf "~{dbSNP_vcf_index}" "~{snp_vcf_index}" + gatk \ --java-options \ "-Xms6000m -Xmx~{java_heap_size}g -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ HaplotypeCaller \ - -R "~{fasta}" \ - -I "~{bam}" \ + -R "$ref_fasta.fa" \ + -I "~{sample}" \ -L "~{interval_list}" \ -O "~{prefix}.vcf.gz" \ ~{if use_soft_clipped_bases @@ -284,7 +412,11 @@ task haplotype_caller { else "--dont-use-soft-clipped-bases" } \ --standard-min-confidence-threshold-for-calling ~{stand_call_conf} \ - --dbsnp "~{dbSNP_vcf}" + --dbsnp "~{snp_vcf}" \ + --emit-ref-confidence "~{reference_confidence}" + + rm -rf "$ref_fasta.fa" "$ref_fasta.fa.fai" "$ref_fasta.dict" "~{sample}" "~{sample + }.bai" "~{snp_vcf}" "~{snp_vcf_index}" >>> output { @@ -292,11 +424,11 @@ task haplotype_caller { File vcf_index = "~{prefix}.vcf.gz.tbi" } - runtime { + requirements { cpu: ncpu memory: "~{memory_gb} GB" disks: "~{disk_size_gb} GB" - container: "quay.io/biocontainers/gatk4:4.4.0.0--py36hdfd78af_0" + container: "quay.io/biocontainers/gatk4:4.6.2.0--py310hdfd78af_1" maxRetries: 1 } } @@ -356,14 +488,29 @@ task variant_filtration { Int disk_size_gb = ceil(size(vcf, "GB") * 2) + 30 + modify_disk_size_gb command <<< + set -euo pipefail + + vcf_name=~{basename(vcf)} + ln -sf "~{vcf}" "$vcf_name" + ln -sf "~{vcf_index}" "$vcf_name.tbi" + + ref_fasta=~{sub(basename(fasta, ".gz"), ".(fasta|fa)?$", "")} + gunzip -c "~{fasta}" > "$ref_fasta.fa" \ + || ln -sf "~{fasta}" "$ref_fasta.fa" + ln -sf "~{fasta_index}" "$ref_fasta.fa.fai" + ln -sf "~{dict}" "$ref_fasta.dict" + + gatk VariantFiltration \ - --R "~{fasta}" \ - --V "~{vcf}" \ + --R "$ref_fasta.fa" \ + --V "$vcf_name" \ --window ~{window} \ --cluster ~{cluster} \ ~{sep(" ", prefix("--filter-name ", quote(filter_names)))} \ ~{sep(" ", prefix("--filter-expression ", squote(filter_expressions)))} \ -O "~{prefix}.filtered.vcf.gz" + + rm "$vcf_name" "$vcf_name.tbi" "$ref_fasta.fa" "$ref_fasta.fa.fai" "$ref_fasta.dict" >>> output { @@ -371,11 +518,11 @@ task variant_filtration { File vcf_filtered_index = "~{prefix}.filtered.vcf.gz.tbi" } - runtime { + requirements { cpu: ncpu memory: "15 GB" disks: "~{disk_size_gb} GB" - container: "quay.io/biocontainers/gatk4:4.4.0.0--py36hdfd78af_0" + container: "quay.io/biocontainers/gatk4:4.6.2.0--py310hdfd78af_1" maxRetries: 1 } } @@ -468,7 +615,7 @@ task mark_duplicates_spark { # shellcheck disable=SC2102 gatk MarkDuplicatesSpark \ - --java-options "-Xmx~{java_heap_size}g" \ + --java-options "-Xmx~{java_heap_size}g -XX:-UseContainerSupport" \ -I "~{bam}" \ -M "~{prefix}.metrics.txt" \ -O "~{if create_bam @@ -493,7 +640,7 @@ task mark_duplicates_spark { File mark_duplicates_metrics = "~{prefix}.metrics.txt" } - runtime { + requirements { cpu: ncpu memory: "~{memory_gb} GB" disks: "~{disk_size_gb} GB" @@ -501,3 +648,475 @@ task mark_duplicates_spark { maxRetries: 1 } } + +task apply_vqsr { + meta { + description: "Applies variant quality score recalibration to a VCF file." + external_help: "https://gatk.broadinstitute.org/hc/en-us/articles/35967650663195-ApplyVQSR" + outputs: { + vcf_recalibrated: "Recalibrated VCF file", + vcf_recalibrated_index: "Index file for the recalibrated VCF", + } + } + + parameter_meta { + reference_fasta: "Reference genome in FASTA format" + reference_fasta_index: "Index for FASTA format genome" + reference_dict: "Dictionary file for FASTA format genome" + vcf: "Input VCF format file on which to apply variant quality score recalibration" + vcf_index: "VCF index file corresponding to the input VCF" + recal_file: "Recalibration file generated by the VariantRecalibrator tool" + recal_file_index: "Index file for the recalibration file" + tranches_file: "Tranches file generated by the VariantRecalibrator tool" + mode: { + description: "Variant type to apply variant quality score recalibration for.", + help: "If `BOTH`, then the same model will be applied to both SNPs and indels.", + } + prefix: "Prefix for the output recalibrated VCF. The extension `.recalibrated.vcf.gz` will be added." + truth_sensitivity_filter_level: { + description: "Truth sensitivity level at which to filter variants.", + help: "This corresponds to the `--truth-sensitivity-filter-level` argument of GATK ApplyVQSR. Default is 99.0, which means that variants will be filtered at the threshold that achieves 99.0% sensitivity on the truth set used to build the model.", + } + modify_disk_size_gb: "Add to or subtract from dynamic disk space allocation. Default disk size is determined by the size of the inputs. Specified in GB." + } + + input { + File reference_fasta + File reference_fasta_index + File reference_dict + File vcf + File vcf_index + File recal_file + #@ except: UnusedInput + File recal_file_index + File tranches_file + VariantMode mode = VariantMode.SNP + String prefix = basename(vcf, ".vcf.gz") + Float truth_sensitivity_filter_level = 99.0 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(vcf, "GB") * 2) + ceil(size(reference_fasta, "GB") * 2) + 30 + + modify_disk_size_gb + + command <<< + set -euo pipefail + + ref_fasta=~{sub(basename(reference_fasta, ".gz"), ".(fasta|fa)?$", "")} + gunzip -c "~{reference_fasta}" > "$ref_fasta.fa" \ + || ln -sf "~{reference_fasta}" "$ref_fasta.fa" + ln -sf "~{reference_fasta_index}" "$ref_fasta.fa.fai" + ln -sf "~{reference_dict}" "$ref_fasta.dict" + + ln -sf "~{vcf}" "~{basename(vcf)}" + ln -sf "~{vcf_index}" "~{basename(vcf_index)}" + + gatk ApplyVQSR \ + -R "$ref_fasta.fa" \ + -V "~{basename(vcf)}" \ + --recal-file "~{recal_file}" \ + --tranches-file "~{tranches_file}" \ + --truth-sensitivity-filter-level ~{truth_sensitivity_filter_level} \ + --mode "~{mode}" \ + -O "~{prefix}.recalibrated.vcf.gz" + + rm -rf "$ref_fasta.fa" "$ref_fasta.fa.fai" "$ref_fasta.dict" "~{basename(vcf)}" "~{ + basename(vcf_index) + }" + >>> + + output { + File vcf_recalibrated = "~{prefix}.recalibrated.vcf.gz" + File vcf_recalibrated_index = "~{prefix}.recalibrated.vcf.gz.tbi" + } + + requirements { + cpu: 1 + memory: "25 GB" + disks: "~{disk_size_gb} GB" + container: "quay.io/biocontainers/gatk4:4.6.2.0--py310hdfd78af_1" + maxRetries: 1 + } +} + +task variant_recalibrator { + meta { + description: "Generates recalibration tables for variant quality score recalibration." + external_help: "https://gatk.broadinstitute.org/hc/en-us/articles/35967662766235-VariantRecalibrator" + outputs: { + recal_file: "Recalibration file containing the model generated by VariantRecalibrator", + recal_index: "Index file for the recalibration file", + tranches_file: "Tranches file containing information about the sensitivity and specificity of the model at different score thresholds", + rscript_file: "R script file for generating plots to evaluate the model.", + } + } + + parameter_meta { + reference_fasta: "Reference genome in FASTA format" + reference_fasta_index: "Index for FASTA format genome" + reference_dict: "Dictionary file for FASTA format genome" + vcf: "Input VCF format file on which to perform variant quality score recalibration" + vcf_index: "VCF index file corresponding to the input VCF" + resource_vcfs: "List of VCF files corresponding to the resources in `resources`" + resource_vcf_indices: "List of VCF index files corresponding to the VCF files in `resource_vcfs`" + resources: "List of resources to use for building the recalibration model" + annotations: "List of annotations to use for building the recalibration model." + mode: "Variant type (SNP or INDEL) to build the recalibration model for. If `BOTH`, then separate models will be built for SNPs and indels." + prefix: "Prefix for the output recalibration files. The extensions `.recal` and `.tranches` will be added." + modify_disk_size_gb: "Add to or subtract from dynamic disk space allocation. Default disk size is determined by the size of the inputs. Specified in GB." + } + + input { + File reference_fasta + File reference_fasta_index + File reference_dict + File vcf + File vcf_index + Array[File] resource_vcfs + Array[File] resource_vcf_indices + Array[String] resources + Array[String] annotations = [ + "QD", + "MQ", + "MQRankSum", + "ReadPosRankSum", + "FS", + "SOR", + ] + VariantMode mode = VariantMode.SNP + String prefix = basename(vcf, ".vcf.gz") + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(vcf, "GB") * 2) + ceil(size(reference_fasta, "GB") * 2) + 30 + + modify_disk_size_gb + + #@ except: ShellCheck + command <<< + set -euo pipefail + + ref_fasta=~{sub(basename(reference_fasta, ".gz"), ".(fasta|fa)?$", "")} + gunzip -c "~{reference_fasta}" > "$ref_fasta.fa" \ + || ln -sf "~{reference_fasta}" "$ref_fasta.fa" + ln -sf "~{reference_fasta_index}" "$ref_fasta.fa.fai" + ln -sf "~{reference_dict}" "$ref_fasta.dict" + + ln -sf "~{vcf}" "~{basename(vcf)}" + ln -sf "~{vcf_index}" "~{basename(vcf_index)}" + + for vcf in ~{sep(" ", resource_vcfs)}; do + ln -sf "~{vcf}" "$(basename "$vcf")" + done + + for vcf_index in ~{sep(" ", resource_vcf_indices)}; do + ln -sf "~{vcf_index}" "$(basename "$vcf_index")" + done + + gatk VariantRecalibrator \ + -R "$ref_fasta.fa" \ + -V "~{basename(vcf)}" \ + --mode "~{mode}" \ + -O "~{prefix}.recal" \ + --tranches-file "~{prefix}.tranches" \ + --rscript-file "~{prefix}.plots.R" \ + --dont-run-rscript \ + ~{sep(" ", prefix("--resource:", resources))} \ + ~{sep(" ", prefix("-an ", quote(annotations)))} + + rm -rf "$ref_fasta.fa" "$ref_fasta.fa.fai" "$ref_fasta.dict" "~{basename(vcf)}" "~{ + basename(vcf_index) + }" + for vcf in ~{sep(" ", resource_vcfs)}; do + rm -rf "$(basename "$vcf")" + done + + for vcf_index in ~{sep(" ", resource_vcf_indices)}; do + rm -rf "$(basename "$vcf_index")" + done + >>> + + output { + File recal_file = "~{prefix}.recal" + File recal_index = "~{prefix}.recal.idx" + File tranches_file = "~{prefix}.tranches" + File rscript_file = "~{prefix}.plots.R" + } + + requirements { + cpu: 1 + memory: "25 GB" + disks: "~{disk_size_gb} GB" + container: "quay.io/biocontainers/gatk4:4.6.2.0--py310hdfd78af_1" + maxRetries: 1 + } +} + +task calculate_genotype_posteriors { + meta { + description: "Calculates posterior genotype probabilities for a VCF file using population allele frequencies." + external_help: "https://gatk.broadinstitute.org/hc/en-us/articles/35967653366299-CalculateGenotypePosteriors" + outputs: { + vcf_posteriors: "VCF file with updated genotype posteriors", + vcf_posteriors_index: "Index file for the VCF with updated genotype posteriors", + } + } + + parameter_meta { + vcf: "Input VCF format file on which to calculate genotype posteriors" + vcf_index: "VCF index file corresponding to the input VCF" + supporting_vcf: "VCF file containing population allele frequencies to use as support for calculating genotype posteriors" + supporting_vcf_index: "VCF index file corresponding to the supporting VCF" + prefix: "Prefix for the output VCF with updated genotype posteriors. The extension `.posteriors.vcf.gz` will be added." + modify_disk_size_gb: "Add to or subtract from dynamic disk space allocation." + } + + input { + File vcf + File vcf_index + File supporting_vcf + File supporting_vcf_index + String prefix = basename(vcf, ".vcf.gz") + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(vcf, "GB") * 2) + 30 + modify_disk_size_gb + + command <<< + set -euo pipefail + + ln -sf "~{vcf}" "~{basename(vcf)}" + ln -sf "~{vcf_index}" "~{basename(vcf_index)}" + ln -sf "~{supporting_vcf}" "supporting.vcf.gz" + ln -sf "~{supporting_vcf_index}" "supporting.vcf.gz.tbi" + + gatk CalculateGenotypePosteriors \ + -V "~{basename(vcf)}" \ + -supporting "supporting.vcf.gz" \ + -O "~{prefix}.posteriors.vcf.gz" + + rm -rf "~{basename(vcf)}" "~{basename(vcf_index)}" "supporting.vcf.gz" "supporting.vcf.gz.tbi" + >>> + + output { + File vcf_posteriors = "~{prefix}.posteriors.vcf.gz" + File vcf_posteriors_index = "~{prefix}.posteriors.vcf.gz.tbi" + } + + requirements { + cpu: 1 + memory: "25 GB" + disks: "~{disk_size_gb} GB" + container: "quay.io/biocontainers/gatk4:4.6.2.0--py310hdfd78af_1" + maxRetries: 1 + } +} + +task genotype_gvcfs { + meta { + description: "Generates per-sample GVCF files from BAM files using GATK's HaplotypeCaller in GVCF mode." + external_help: "https://gatk.broadinstitute.org/hc/en-us/articles/35967678260379-GenotypeGVCFs" + outputs: { + vcf: "VCF file containing variant and non-variant sites with genotype likelihoods", + vcf_index: "Index file for the VCF", + } + } + + parameter_meta { + gvcf: "Input GVCF format file containing GVCF records to genotype" + gvcf_index: "GVCF index file corresponding to the input GVCF" + fasta: "Reference genome in FASTA format" + fasta_index: "Index for FASTA format genome" + dict: "Dictionary file for FASTA format genome" + prefix: "Prefix for the output GVCF. The extension `.g.vcf.gz` will be added." + modify_disk_size_gb: "Add to or subtract from dynamic memory allocation. Default memory is determined by the size of the inputs. Specified in GB." + } + + input { + File gvcf + File gvcf_index + File fasta + File fasta_index + File dict + String prefix + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(gvcf, "GB") * 2) + 30 + ceil(size(fasta, "GB")) + modify_disk_size_gb + + command <<< + set -euo pipefail + + ref_fasta=~{sub(basename(fasta, ".gz"), ".(fasta|fa)?$", "")} + gunzip -c "~{fasta}" > "$ref_fasta.fa" \ + || ln -sf "~{fasta}" "$ref_fasta.fa" + ln -sf "~{fasta_index}" "$ref_fasta.fa.fai" + ln -sf "~{dict}" "$ref_fasta.dict" + + ln -sf "~{gvcf}" "~{basename(gvcf)}" + ln -sf "~{gvcf_index}" "~{basename(gvcf_index)}" + + gatk \ + --java-options "-Xmx4g" \ + GenotypeGVCFs \ + -R "$ref_fasta.fa" \ + -V "~{basename(gvcf)}" \ + -O "~{prefix}.g.vcf.gz" \ + + rm -rf "$ref_fasta.fa" "$ref_fasta.fa.fai" "$ref_fasta.dict" "~{basename(gvcf)}" "~{ + basename(gvcf_index) + }" + >>> + + output { + File vcf = "~{prefix}.g.vcf.gz" + File vcf_index = "~{prefix}.g.vcf.gz.tbi" + } + + requirements { + cpu: 1 + memory: "25 GB" + disks: "~{disk_size_gb} GB" + container: "quay.io/biocontainers/gatk4:4.6.2.0--py310hdfd78af_1" + } +} + +workflow germline_variant_calling_wf { + meta { + description: "Workflow for calling germline variants using GATK's HaplotypeCaller." + outputs: { + raw_gvcf: "Raw gVCF file output by HaplotypeCaller", + raw_gvcf_index: "Index file for the raw gVCF", + raw_vcf: "Raw VCF file output by GenotypeGVCFs", + raw_vcf_index: "Index file for the raw VCF", + recalibrated_vcf: "Recalibrated VCF file after applying VQSR", + recalibrated_vcf_index: "Index file for the recalibrated VCF", + vcf_final: "Final VCF file after calculating genotype posteriors", + vcf_final_index: "Index file for the final VCF", + } + } + + parameter_meta { + bam: "Input BAM format file on which to perform germline variant calling" + bam_index: "BAM index file corresponding to the input BAM" + reference_fasta: "Reference genome in FASTA format" + reference_fasta_index: "Index for FASTA format genome" + reference_dict: "Dictionary file for FASTA format genome" + dbSNP_vcf: "dbSNP VCF file" + dbSNP_vcf_index: "dbSNP VCF index file" + interval_list: "Interval list indicating regions in which to call variants" + known_indels_sites_vcfs: "List of VCF files containing known indel sites to use for base quality score recalibration" + known_indels_sites_indices: "List of VCF index files corresponding to the VCF files in `known_indels_sites_vcfs`" + resources: { + description: "List of resources to use for building the variant quality score recalibration model.", + help: " Each resource should be a tuple containing the name of the resource, the path to the VCF file for the resource, and the path to the VCF index file for the resource.", + } + prefix: "Prefix for the output files." + } + + input { + File bam + File bam_index + File reference_fasta + File reference_fasta_index + File reference_dict + #@ except: SnakeCase + File dbSNP_vcf + #@ except: SnakeCase + File dbSNP_vcf_index + File interval_list + Array[File] known_indels_sites_vcfs + Array[File] known_indels_sites_indices + Array[Resource] resources + String prefix = basename(bam, ".bam") + } + + scatter (resource in resources) { + call resource_to_string { + res = resource, + } + } + + call base_recalibrator { + bam, + bam_index, + fasta = reference_fasta, + fasta_index = reference_fasta_index, + dict = reference_dict, + dbSNP_vcf, + dbSNP_vcf_index, + known_indels_sites_vcfs, + known_indels_sites_indices, + outfile_name = prefix + ".recalibration_report.txt", + } + + call apply_bqsr { + bam, + bam_index, + recalibration_report = base_recalibrator.recalibration_report, + prefix = prefix + ".recal", + } + + call haplotype_caller { + bam = apply_bqsr.recalibrated_bam, + bam_index = apply_bqsr.recalibrated_bam_index, + interval_list, + fasta = reference_fasta, + fasta_index = reference_fasta_index, + dict = reference_dict, + dbSNP_vcf, + dbSNP_vcf_index, + prefix, + reference_confidence = ref_confidence.GVCF, + } + + call genotype_gvcfs { + gvcf = haplotype_caller.vcf, + gvcf_index = haplotype_caller.vcf_index, + fasta = reference_fasta, + fasta_index = reference_fasta_index, + dict = reference_dict, + prefix, + } + + call variant_recalibrator { + reference_fasta, + reference_fasta_index, + reference_dict, + vcf = genotype_gvcfs.vcf, + vcf_index = genotype_gvcfs.vcf_index, + resources = resource_to_string.res_string, + resource_vcfs = resource_to_string.res_vcf, + resource_vcf_indices = resource_to_string.res_vcf_index, + } + + call apply_vqsr { + reference_fasta, + reference_fasta_index, + reference_dict, + vcf = genotype_gvcfs.vcf, + vcf_index = genotype_gvcfs.vcf_index, + recal_file = variant_recalibrator.recal_file, + recal_file_index = variant_recalibrator.recal_index, + tranches_file = variant_recalibrator.tranches_file, + prefix = prefix + ".vqsr", + } + + call calculate_genotype_posteriors { + vcf = apply_vqsr.vcf_recalibrated, + vcf_index = apply_vqsr.vcf_recalibrated_index, + supporting_vcf = dbSNP_vcf, + supporting_vcf_index = dbSNP_vcf_index, + prefix = prefix + ".posteriors", + } + + output { + File raw_gvcf = haplotype_caller.vcf + File raw_gvcf_index = haplotype_caller.vcf_index + File raw_vcf = genotype_gvcfs.vcf + File raw_vcf_index = genotype_gvcfs.vcf_index + File recalibrated_vcf = apply_vqsr.vcf_recalibrated + File recalibrated_vcf_index = apply_vqsr.vcf_recalibrated_index + File vcf_final = calculate_genotype_posteriors.vcf_posteriors + File vcf_final_index = calculate_genotype_posteriors.vcf_posteriors_index + } +} diff --git a/tools/hisat2.wdl b/tools/hisat2.wdl new file mode 100644 index 000000000..87cd74c38 --- /dev/null +++ b/tools/hisat2.wdl @@ -0,0 +1,228 @@ +version 1.2 + +task align { + meta { + description: "Align RNA-seq reads against a reference genome using HISAT2" + deprecated: true + outputs: { + alignments: "The output alignment file in SAM format", + } + } + + parameter_meta { + read_one_fastq_gz: "Input gzipped FASTQ read one file to align with HISAT2" + reference_index: "The HISAT2 index files for the reference genome" + read_two_fastq_gz: "Input gzipped FASTQ read two file to align with HISAT2" + output_name: "The name of the output alignment file" + threads: "Number of threads to use for alignment" + modify_disk_size_gb: "Additional disk space to allocate (in GB)" + } + + input { + File read_one_fastq_gz + File reference_index + File? read_two_fastq_gz + String output_name = "aligned.bam" + Int threads = 4 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil((size(read_one_fastq_gz, "GB") + size(read_two_fastq_gz, "GB") + ) * 2) + ceil(size(reference_index, "GB") * 5) + 10 + modify_disk_size_gb + + command <<< + set -euo pipefail + + mkdir hisat2_db + tar -C hisat2_db -xzf "~{reference_index}" --no-same-owner + PREFIX=$(basename hisat2_db/*.1.ht2 ".1.ht2") + + hisat2 \ + -q \ + -p ~{threads} \ + -x "hisat2_db/$PREFIX" \ + -1 "~{read_one_fastq_gz}" \ + ~{if defined(read_two_fastq_gz) + then "-2 \"~{read_two_fastq_gz}\"" + else "" + } \ + | samtools view -bS - > "~{output_name}" + + rm -r hisat2_db + >>> + + output { + File alignments = "~{output_name}" + } + + requirements { + cpu: threads + memory: "120 GB" + disks: "~{disk_size_gb} GB" + container: "ghcr.io/stjudecloud/hisat2:branch-minimap2-2.2.1-0" + } +} + +task index { + meta { + description: "Index a reference genome for alignment with HISAT2" + deprecated: true + outputs: { + reference_index: "The HISAT2 index files for the reference genome", + } + } + + parameter_meta { + reference_fasta: "The reference genome in FASTA format to be indexed" + snp: "List of SNPs" + haplotype: "List of haplotypes" + splice_site: "List of splice sites. Use with `exon`." + exon: "List of exons. Use with `splice_site`." + repeat_ref: "" + repeat_info: "" + repeat_snp: "" + repeat_haplotype: "" + bmax: "Maximum number of suffixes allowed in a block" + seed: "Seed for psuedo-random number generator" + bmaxdivn: "Maximum number of suffixes allowed in a block, expressed as a fraction of the length of the reference" + index_base_name: "The base name for the output index files" + force_large_index: "Force creation of a large index" + disable_auto_fitting: "Disable automatic fitting of index parameters" + nodc: "Disable difference-cover sample" + no_ref: "Do not build bitpacked version of reference sequence for paired-end alignment" + just_ref: "Build only the bitpacked version of reference sequence for paired-end alignment" + threads: "Number of threads to use for indexing" + dcv: "Period for the difference-cover sample. A larger period uses less memory, but may be slower. Must be a power of 2, no greater than 4096." + offrate: "The off-rate for the FM index" + ftabchars: "The lookup table to calculate initial BW range with respect to the first N characters of the query" + localoffrate: "The off-rate for the local FM index" + localftabchars: "The lookup table to calculate initial BW range for the local FM" + modify_disk_size_gb: "Additional disk space to allocate (in GB)" + } + + input { + File reference_fasta + File? snp + File? haplotype + File? splice_site + File? exon + File? repeat_ref + File? repeat_info + File? repeat_snp + File? repeat_haplotype + Int? bmax + Int? seed + Int? bmaxdivn = 4 + String index_base_name = "hisat2_index" + Boolean force_large_index = false + Boolean disable_auto_fitting = false + Boolean nodc = false + Boolean no_ref = false + Boolean just_ref = false + Int threads = 1 + Int dcv = 1024 + Int offrate = 5 + Int ftabchars = 10 + Int localoffrate = 3 + Int localftabchars = 6 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(reference_fasta, "GB") * 2) + 10 + modify_disk_size_gb + + command <<< + set -euo pipefail + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + + hisat2-build \ + ~{if force_large_index + then "--large-index" + else "" + } \ + ~{if disable_auto_fitting + then "--disable-auto-fitting" + else "" + } \ + -p ~{threads} \ + ~{if defined(bmax) + then "--bmax \"~{bmax}\"" + else "" + } \ + ~{if defined(bmaxdivn) + then "--bmaxdivn \"~{bmaxdivn}\"" + else "" + } \ + ~{if !nodc + then "--dcv \"~{dcv}\"" + else "" + } \ + ~{if no_ref + then "--no-ref" + else "" + } \ + ~{if just_ref + then "--just-ref" + else "" + } \ + --offrate "~{offrate}" \ + --ftabchars "~{ftabchars}" \ + --localoffrate "~{localoffrate}" \ + --localftabchars "~{localftabchars}" \ + ~{if defined(snp) + then "--snp \"~{snp}\"" + else "" + } \ + ~{if defined(haplotype) + then "--haplotype \"~{haplotype}\"" + else "" + } \ + ~{if defined(splice_site) + then "--ss \"~{splice_site}\"" + else "" + } \ + ~{if defined(exon) + then "--exon \"~{exon}\"" + else "" + } \ + ~{if defined(repeat_ref) + then "--repeat-ref \"~{repeat_ref}\"" + else "" + } \ + ~{if defined(repeat_info) + then "--repeat-info \"~{repeat_info}\"" + else "" + } \ + ~{if defined(repeat_snp) + then "--repeat-snp \"~{repeat_snp}\"" + else "" + } \ + ~{(if defined(repeat_haplotype) + then "--repeat-haplotype \"~{repeat_haplotype}\"" + else "" + )} \ + ~{if defined(seed) + then "--seed \"~{seed}\"" + else "" + } \ + "$ref_fasta" \ + "~{index_base_name}" + + tar -czf "~{index_base_name}.tar.gz" "~{index_base_name}"* + + rm -r "$ref_fasta" + >>> + + output { + File reference_index = "~{index_base_name}.tar.gz" + } + + requirements { + cpu: threads + memory: "16 GB" + disks: "~{disk_size_gb} GB" + container: "ghcr.io/stjudecloud/hisat2:branch-minimap2-2.2.1-0" + } +} diff --git a/tools/manta.wdl b/tools/manta.wdl new file mode 100644 index 000000000..988edacac --- /dev/null +++ b/tools/manta.wdl @@ -0,0 +1,193 @@ +version 1.2 + +task manta_germline { + meta { + description: "Run Manta structural variant and indel caller" + outputs: { + manta_output: "Directory containing Manta variant calls", + log_file: "Log file from the Manta workflow execution", + } + } + + parameter_meta { + reference_fasta: "Reference genome in FASTA format" + reference_fasta_index: "Index file for the reference genome FASTA" + bam: "Input BAM file with aligned reads" + bam_index: "Index file for the input BAM file" + calling_regions_bed: "Optional BED file specifying regions to call variants" + calling_regions_index: "Index file for the calling regions BED file" + output_dir: "Directory to store Manta output" + exome: "Whether to run Manta in exome mode, which disables high depth filters for calling variants in exome data" + threads: "Number of threads to use" + modify_disk_size_gb: "Additional disk size in GB to allocate" + } + + input { + File reference_fasta + File reference_fasta_index + File bam + File bam_index + File? calling_regions_bed + #@ except: UnusedInput + File? calling_regions_index + String output_dir = "manta_output" + Boolean exome = false + Int threads = 20 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(reference_fasta, "GB") * 2) + ceil(size(bam, "GB")) + 20 + + modify_disk_size_gb + + String filename = basename(bam) + + command <<< + set -euo pipefail + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + ln -sf "~{reference_fasta_index}" "$ref_fasta.fai" + + ln -s "~{bam}" "~{filename}" + ln -s "~{bam_index}" "~{filename}.bai" + + configManta.py \ + --bam "~{filename}" \ + --referenceFasta "$ref_fasta" \ + ~{if defined(calling_regions_bed) + then "--callRegions '" + calling_regions_bed + "'" + else "" + } \ + ~{if exome + then "--exome" + else "" + } \ + --runDir "~{output_dir}" + + "~{output_dir}/runWorkflow.py" -j "~{threads}" + + rm -rf "$ref_fasta" "$ref_fasta.fai" "~{filename}" "~{filename}.bai" + >>> + + output { + Directory manta_output = output_dir + File log_file = "~{output_dir}/workspace/pyflow.data/logs/pyflow_log.txt" + } + + requirements { + container: "quay.io/biocontainers/manta:1.6.0--py27h9948957_6" + cpu: threads + memory: "25 GB" + disks: "~{disk_size_gb} GB" + } +} + +task manta_somatic { + meta { + description: "Run Manta structural variant and indel caller in somatic mode" + outputs: { + manta_output: "Directory containing Manta variant calls", + indel_candidates: "VCF file with candidate small indels", + indel_candidates_index: "Index file for the candidate small indels VCF", + sv_candidates: "VCF file with candidate structural variants", + sv_candidates_index: "Index file for the candidate structural variants VCF", + somatic_sv: "VCF file with candidate somatic structural variants", + somatic_sv_index: "Index file for the candidate somatic structural variants VCF", + diploid_sv: "VCF file with candidate diploid structural variants", + diploid_sv_index: "Index file for the candidate diploid structural variants VCF", + log_file: "Log file from the Manta workflow execution", + } + } + + parameter_meta { + reference_fasta: "Reference genome in FASTA format" + reference_fasta_index: "Index file for the reference genome FASTA" + tumor_bam: "Input BAM file with aligned reads from tumor sample" + tumor_bam_index: "Index file for the tumor BAM file" + normal_bam: "Input BAM file with aligned reads from normal sample" + normal_bam_index: "Index file for the normal BAM file" + calling_regions_bed: "Optional BED file specifying regions to call variants" + calling_regions_index: "Index file for the calling regions BED file" + output_dir: "Directory to store Manta output" + exome: "Whether to run Manta in exome mode, which disables high depth filters for calling variants in exome data" + threads: "Number of threads to use" + modify_disk_size_gb: "Additional disk size in GB to allocate" + } + + input { + File reference_fasta + File reference_fasta_index + File tumor_bam + File tumor_bam_index + File normal_bam + File normal_bam_index + File? calling_regions_bed + #@ except: UnusedInput + File? calling_regions_index + String output_dir = "manta_output" + Boolean exome = false + Int threads = 20 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(reference_fasta, "GB") * 2) + ceil(size(tumor_bam, "GB")) + + ceil(size(normal_bam, "GB")) + 20 + modify_disk_size_gb + + String tumor = basename(tumor_bam) + String normal = basename(normal_bam) + + command <<< + set -euo pipefail + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + ln -sf "~{reference_fasta_index}" "$ref_fasta.fai" + + ln -sf "~{tumor_bam}" "~{tumor}" + ln -sf "~{tumor_bam_index}" "~{tumor}.bai" + ln -sf "~{normal_bam}" "~{normal}" + ln -sf "~{normal_bam_index}" "~{normal}.bai" + + configManta.py \ + --normalBam "~{normal}" \ + --tumorBam "~{tumor}" \ + --referenceFasta "$ref_fasta" \ + ~{if exome + then "--exome" + else "" + } \ + ~{if defined(calling_regions_bed) + then "--callRegions '" + calling_regions_bed + "'" + else "" + } \ + --runDir "~{output_dir}" + + "~{output_dir}/runWorkflow.py" -j "~{threads}" + + rm -rf "$ref_fasta" "$ref_fasta.fai" "~{tumor}" "~{tumor}.bai" "~{normal}" "~{ + normal + }.bai" + >>> + + output { + Directory manta_output = output_dir + File indel_candidates = "~{output_dir}/results/variants/candidateSmallIndels.vcf.gz" + File indel_candidates_index = "~{output_dir}/results/variants/candidateSmallIndels.vcf.gz.tbi" + File sv_candidates = "~{output_dir}/results/variants/candidateSV.vcf.gz" + File sv_candidates_index = "~{output_dir}/results/variants/candidateSV.vcf.gz.tbi" + File somatic_sv = "~{output_dir}/results/variants/somaticSV.vcf.gz" + File somatic_sv_index = "~{output_dir}/results/variants/somaticSV.vcf.gz.tbi" + File diploid_sv = "~{output_dir}/results/variants/diploidSV.vcf.gz" + File diploid_sv_index = "~{output_dir}/results/variants/diploidSV.vcf.gz.tbi" + File log_file = "~{output_dir}/workspace/pyflow.data/logs/pyflow_log.txt" + } + + requirements { + container: "quay.io/biocontainers/manta:1.6.0--py27h9948957_6" + cpu: threads + memory: "25 GB" + disks: "~{disk_size_gb} GB" + } +} diff --git a/tools/md5sum.wdl b/tools/md5sum.wdl index 47f07e21b..e2f901f0d 100755 --- a/tools/md5sum.wdl +++ b/tools/md5sum.wdl @@ -20,7 +20,7 @@ task compute_checksum { } Float file_size = size(file, "GB") - Int disk_size_gb = ceil(file_size) + 10 + modify_disk_size_gb + Int disk_size_gb = ceil(file_size) + 30 + modify_disk_size_gb String outfile_name = basename(file) + ".md5" diff --git a/tools/minimap2.wdl b/tools/minimap2.wdl new file mode 100644 index 000000000..c722ffa05 --- /dev/null +++ b/tools/minimap2.wdl @@ -0,0 +1,196 @@ +version 1.2 + +task align { + meta { + description: "Align DNA or mRNA sequences against a large reference database" + outputs: { + alignments: "The output alignment file in SAM or PAF format", + } + } + + parameter_meta { + read_one_fastq_gz: "Input gzipped FASTQ read one file to align with minimap2" + reference_index: "The minimap2 index file for the reference genome" + read_group: "The read group string to be included in the SAM header. Format: '@RG\\tID:foo\\tSM:bar'" + read_two_fastq_gz: "Input gzipped FASTQ read two file to align with minimap2" + preset: { + description: "Minimap2 preset for alignment", + external_help: "https://lh3.github.io/minimap2/minimap2.html#8", + options: [ + "sr", + "map-ont", + "lr:hq", + "map-hifi", + "map-pb", + "map-iclr", + "asm5", + "asm10", + "asm20", + "splice", + "splice:hq", + "splice:sr", + "ava-pb", + "ava-ont", + ], + } + output_name: "The name of the output alignment file" + output_paf: "If true, output in PAF format instead of BAM" + cigar_in_paf: "If true and outputting PAF, include CIGAR strings in the PAF output" + ignore_base_quality: "If true, ignore base quality scores during alignment" + output_md_tag: "If true, include MD tags in the SAM output" + eqx: "If true, use =/X CIGAR operators instead of M" + soft_clip: "If true, use soft clipping for secondary alignments in SAM format" + secondary_alignments: "If true, report secondary alignments" + seed: "Seed value for the minimap2 aligner" + threads: "Number of threads to use for alignment" + modify_disk_size_gb: "Additional disk space to allocate (in GB)" + } + + input { + File read_one_fastq_gz + File reference_index + String read_group + File? read_two_fastq_gz + String? preset = "sr" + String output_name = "aligned.bam" + Boolean output_paf = false + Boolean cigar_in_paf = true + Boolean ignore_base_quality = false + Boolean output_md_tag = true + Boolean eqx = false + Boolean soft_clip = true + Boolean secondary_alignments = true + Int seed = 11 + Int threads = 3 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil((size(read_one_fastq_gz, "GB") + size(read_two_fastq_gz, "GB") + ) * 3) + ceil(size(reference_index, "GB")) + 10 + modify_disk_size_gb + + command <<< + set -euo pipefail + + minimap2 \ + ~{if defined(preset) + then "-x \"~{preset}\"" + else "" + } \ + ~{if output_paf + then "" + else "-a" + } \ + ~{if output_paf && cigar_in_paf + then "-c" + else "" + } \ + ~{if ignore_base_quality + then "-Q" + else "" + } \ + ~{if output_md_tag + then "--MD" + else "" + } \ + ~{if eqx + then "-X" + else "" + } \ + ~{if soft_clip + then "-Y" + else "" + } \ + ~{if secondary_alignments + then "--secondary=yes" + else "--secondary=no" + } \ + -t ~{threads} \ + --seed ~{seed} \ + -R "~{read_group}" \ + "~{reference_index}" \ + "~{read_one_fastq_gz}" \ + ~{if defined(read_two_fastq_gz) + then "\"~{read_two_fastq_gz}\"" + else "" + } \ + | if ~{output_paf}; then + cat - > "~{output_name}" + else + samtools view -b - > "~{output_name}" + fi + >>> + + output { + File alignments = output_name + } + + requirements { + container: "ghcr.io/stjudecloud/minimap2:branch-minimap2-2.30-0" + cpu: threads + memory: "16 GB" + disks: "~{disk_size_gb} GB" + } +} + +task index { + meta { + description: "Create a minimap2 index for a reference genome" + outputs: { + reference_index: "The generated minimap2 index file", + } + } + + parameter_meta { + reference_fasta: "The reference genome in FASTA format to be indexed" + alt_contigs: "Optional file containing a list of alternative contigs" + index_name: "The name of the output index file" + minimizer_kmer_size: "K-mer size for minimizer indexing" + minimizer_window_size: "Window size for minimizer indexing" + threads: "Number of threads to use for indexing" + modify_disk_size_gb: "Additional disk space to allocate (in GB)" + } + + input { + File reference_fasta + File? alt_contigs + String index_name = "reference.mmi" + Int minimizer_kmer_size = 15 + Int minimizer_window_size = 10 + Int threads = 3 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(reference_fasta, "GB")) + 10 + modify_disk_size_gb + + command <<< + set -euo pipefail + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + + minimap2 \ + -k ~{minimizer_kmer_size} \ + -w ~{minimizer_window_size} \ + ~{if defined(alt_contigs) + then "--alt \"~{alt_contigs}\"" + else "" + } \ + -t ~{threads} \ + -d "~{index_name}" \ + "$ref_fasta" + + rm -r "$ref_fasta" + >>> + + output { + File reference_index = index_name + } + + requirements { + container: "ghcr.io/stjudecloud/minimap2:branch-minimap2-2.30-0" + cpu: threads + memory: "16 GB" + disks: "~{disk_size_gb} GB" + } +} diff --git a/tools/mosdepth.wdl b/tools/mosdepth.wdl index 63606083d..7f37661ee 100644 --- a/tools/mosdepth.wdl +++ b/tools/mosdepth.wdl @@ -38,7 +38,7 @@ task coverage { } Float bam_size = size(bam, "GB") - Int disk_size_gb = ceil(bam_size) + 10 + modify_disk_size_gb + Int disk_size_gb = ceil(bam_size) + 30 + modify_disk_size_gb command <<< set -euo pipefail diff --git a/tools/mutect2.wdl b/tools/mutect2.wdl new file mode 100644 index 000000000..cde1d29ef --- /dev/null +++ b/tools/mutect2.wdl @@ -0,0 +1,448 @@ +version 1.3 + +workflow mutect2_wf { + meta { + description: "Workflow for calling somatic variants using GATK Mutect2 and filtering with FilterMutectCalls" + outputs: { + filtered_somatic_vcf: "VCF file with filtered somatic variants from Mutect2", + filtered_somatic_vcf_index: "Index file for the filtered Mutect2 somatic variants VCF", + } + } + + parameter_meta { + reference_fasta: "Reference genome in FASTA format" + reference_fasta_index: "Index file for the reference genome FASTA" + reference_fasta_dict: "Dictionary file for the reference genome FASTA" + normal_bam: "Input BAM file with aligned reads for normal sample" + normal_bam_index: "Index file for the normal BAM file" + tumor_bam: "Input BAM file with aligned reads for tumor sample" + tumor_bam_index: "Index file for the tumor BAM file" + variant_vcf: "VCF file with variants and allele frequencies to summarize pileups over." + variant_vcf_index: "Index file for the variant VCF" + intervals: "One or more genomic intervals over which to operate. Often the same file as `variants`" + intervals_index: "Index file for the intervals file" + germline_resource_vcf: "Optional VCF file with germline variants for Mutect2, recommended to be from gnomAD or similar population resource" + germline_resource_vcf_index: "Index file for the germline resource VCF" + panel_of_normals_vcf: "Optional VCF file with panel of normals for Mutect2, recommended to be generated from a large set of normal samples processed with Mutect2" + panel_of_normals_vcf_index: "Index file for the panel of normals VCF" + normal_sample_name: "Name of the normal sample" + tumor_sample_name: "Name of the tumor sample" + output_prefix: "Prefix for output files. The extensions '.vcf.gz' and '.vcf.gz.tbi' will be added." + } + + input { + File reference_fasta + File reference_fasta_index + File reference_fasta_dict + File normal_bam + File normal_bam_index + File tumor_bam + File tumor_bam_index + File variant_vcf + File variant_vcf_index + File intervals + File intervals_index + File? germline_resource_vcf + File? germline_resource_vcf_index + File? panel_of_normals_vcf + File? panel_of_normals_vcf_index + String normal_sample_name = basename(normal_bam, ".bam") + String tumor_sample_name = basename(tumor_bam, ".bam") + String output_prefix = basename(tumor_bam, ".bam") + "_v_" + basename(normal_bam, ".bam" + ) + } + + call mutect2 { + reference_fasta, + reference_fasta_index, + reference_fasta_dict, + normal_bam, + normal_bam_index, + tumor_bam, + tumor_bam_index, + normal_sample_name, + tumor_sample_name, + output_prefix = output_prefix + "_unfiltered", + germline_resource_vcf, + germline_resource_vcf_index, + panel_of_normals_vcf, + panel_of_normals_vcf_index, + } + + call get_pileup_summaries as get_tumor_pileups { + bam = tumor_bam, + bam_index = tumor_bam_index, + intervals, + intervals_index, + variants = variant_vcf, + variants_index = variant_vcf_index, + prefix = output_prefix + "_tumor", + reference_fasta, + reference_fasta_index, + reference_fasta_dict, + } + + call get_pileup_summaries as get_normal_pileups { + bam = normal_bam, + bam_index = normal_bam_index, + intervals, + intervals_index, + variants = variant_vcf, + variants_index = variant_vcf_index, + prefix = output_prefix + "_normal", + reference_fasta, + reference_fasta_index, + reference_fasta_dict, + } + + call calculate_contamination { + tumor_pileups = get_tumor_pileups.pileup_summaries, + normal_pileups = get_normal_pileups.pileup_summaries, + prefix = output_prefix, + } + + call filter_mutect { + unfiltered_somatic_vcf = mutect2.somatic_vcf, + unfiltered_somatic_vcf_index = mutect2.somatic_vcf_index, + unfiltered_somatic_vcf_stats = mutect2.stats, + reference_fasta, + reference_fasta_index, + reference_fasta_dict, + contamination_table = calculate_contamination.contamination_table, + maf_segments = calculate_contamination.maf_segments, + prefix = output_prefix, + } + + output { + File filtered_somatic_vcf = filter_mutect.filtered_somatic_vcf + File filtered_somatic_vcf_index = filter_mutect.filtered_somatic_vcf_index + } +} + +task mutect2 { + meta { + description: "Run GATK Mutect2 somatic variant calling workflow" + outputs: { + somatic_vcf: "VCF file with somatic variants called by Mutect2", + somatic_vcf_index: "Index file for the Mutect2 somatic variants VCF", + stats: "VCF file with statistics about the Mutect2 somatic variant calls", + } + } + + parameter_meta { + reference_fasta: "Reference genome in FASTA format" + reference_fasta_index: "Index file for the reference genome FASTA" + reference_fasta_dict: "Dictionary file for the reference genome FASTA" + normal_bam: "Input BAM file with aligned reads for normal sample" + normal_bam_index: "Index file for the normal BAM file" + tumor_bam: "Input BAM file with aligned reads for tumor sample" + tumor_bam_index: "Index file for the tumor BAM file" + germline_resource_vcf: "Optional VCF file with germline variants for Mutect2, recommended to be from gnomAD or similar population resource" + germline_resource_vcf_index: "Index file for the germline resource VCF" + panel_of_normals_vcf: "Optional VCF file with panel of normals for Mutect2, recommended to be generated from a large set of normal samples processed with Mutect2" + panel_of_normals_vcf_index: "Index file for the panel of normals VCF" + normal_sample_name: "Name of the normal sample" + tumor_sample_name: "Name of the tumor sample" + output_prefix: "Prefix for output file. The extension '.vcf.gz' will be added." + threads: "Number of threads to use" + modify_disk_size_gb: "Additional disk size in GB to allocate" + } + + input { + File reference_fasta + File reference_fasta_index + File reference_fasta_dict + File normal_bam + File normal_bam_index + File tumor_bam + File tumor_bam_index + File? germline_resource_vcf + #@ except: UnusedInput + File? germline_resource_vcf_index + File? panel_of_normals_vcf + #@ except: UnusedInput + File? panel_of_normals_vcf_index + String normal_sample_name = basename(normal_bam, ".bam") + String tumor_sample_name = basename(tumor_bam, ".bam") + String output_prefix = "mutect2_somatic_variants" + Int threads = 4 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(reference_fasta, "GB") * 2) + ceil(size(normal_bam, "GB") + * 2) + ceil(size(tumor_bam, "GB") * 2) + 20 + modify_disk_size_gb + + String tumor = basename(tumor_bam) + String normal = basename(normal_bam) + + command <<< + set -euo pipefail + + ref_fasta=~{sub(basename(reference_fasta, ".gz"), ".(fasta|fa)?$", "")} + gunzip -c "~{reference_fasta}" > "$ref_fasta.fa" \ + || ln -sf "~{reference_fasta}" "$ref_fasta.fa" + ln -sf "~{reference_fasta_index}" "$ref_fasta.fa.fai" + ln -sf "~{reference_fasta_dict}" "$ref_fasta.dict" + + ln -sf "~{tumor_bam}" "~{tumor}" + ln -sf "~{tumor_bam_index}" "~{tumor}.bai" + ln -sf "~{normal_bam}" "~{normal}" + ln -sf "~{normal_bam_index}" "~{normal}.bai" + + gatk Mutect2 \ + -R "$ref_fasta.fa" \ + -I "~{tumor}" \ + -I "~{normal}" \ + -normal "~{normal_sample_name}" \ + -tumor "~{tumor_sample_name}" \ + ~{if defined(germline_resource_vcf) + then "-germline-resource '" + germline_resource_vcf + "'" + else "" + } \ + ~{if defined(panel_of_normals_vcf) + then "-panel-of-normals '" + panel_of_normals_vcf + "'" + else "" + } \ + -O "~{output_prefix}.vcf.gz" \ + --native-pair-hmm-threads "~{threads}" + + rm -rf "$ref_fasta.fa" "$ref_fasta.fa.fai" "$ref_fasta.dict" "~{tumor}" "~{tumor}.bai" "~{ + normal + }" "~{normal}.bai" + >>> + + output { + File somatic_vcf = "~{output_prefix}.vcf.gz" + File somatic_vcf_index = "~{output_prefix}.vcf.gz.tbi" + File stats = "~{output_prefix}.vcf.gz.stats" + } + + requirements { + container: "quay.io/biocontainers/gatk4:4.6.2.0--py310hdfd78af_1" + cpu: threads + memory: "25 GB" + disks: "~{disk_size_gb} GB" + } +} + +task filter_mutect { + meta { + description: "Run GATK FilterMutectCalls to filter Mutect2 somatic variant calls" + outputs: { + filtered_somatic_vcf: "VCF file with filtered somatic variants from Mutect2", + filtered_somatic_vcf_index: "Index file for the filtered Mutect2 somatic variants VCF", + } + } + + parameter_meta { + unfiltered_somatic_vcf: "Input VCF file with unfiltered somatic variants from Mutect2" + unfiltered_somatic_vcf_index: "Index file for the unfiltered Mutect2 somatic variants VCF" + unfiltered_somatic_vcf_stats: "VCF file with statistics about the unfiltered Mutect2 somatic variant calls" + reference_fasta: "Reference genome in FASTA format" + reference_fasta_index: "Index file for the reference genome FASTA" + reference_fasta_dict: "Dictionary file for the reference genome FASTA" + contamination_table: "Table file with estimated contamination fraction and related metrics from GATK CalculateContamination" + maf_segments: "Table file with segmented genomic intervals for MAF calculation from GATK CalculateContamination" + prefix: "Prefix for output file. The extension '.vcf.gz' will be added." + modify_disk_size_gb: "Additional disk size in GB to allocate" + } + + input { + File unfiltered_somatic_vcf + File unfiltered_somatic_vcf_index + File unfiltered_somatic_vcf_stats + File reference_fasta + File reference_fasta_index + File reference_fasta_dict + File contamination_table + File maf_segments + String prefix = basename(unfiltered_somatic_vcf, ".vcf.gz") + "_filtered" + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(reference_fasta, "GB") * 2) + ceil(size( + unfiltered_somatic_vcf, "GB" + ) * 2) + 20 + modify_disk_size_gb + + command <<< + set -euo pipefail + + ref_fasta=~{sub(basename(reference_fasta, ".gz"), ".(fasta|fa)?$", "")} + gunzip -c "~{reference_fasta}" > "$ref_fasta.fa" \ + || ln -sf "~{reference_fasta}" "$ref_fasta.fa" + ln -sf "~{reference_fasta_index}" "$ref_fasta.fa.fai" + ln -sf "~{reference_fasta_dict}" "$ref_fasta.dict" + + ln -sf "~{unfiltered_somatic_vcf}" "~{basename(unfiltered_somatic_vcf)}" + ln -sf "~{unfiltered_somatic_vcf_index}" "~{basename(unfiltered_somatic_vcf_index) + }" + ln -sf "~{unfiltered_somatic_vcf_stats}" "~{basename(unfiltered_somatic_vcf_stats) + }" + + gatk --java-options "-Xmx~{24000}m" \ + FilterMutectCalls \ + -R "$ref_fasta.fa" \ + -V "~{basename(unfiltered_somatic_vcf)}" \ + --contamination-table "~{contamination_table}" \ + --tumor-segmentation "~{maf_segments}" \ + -O "~{prefix}.vcf.gz" + + rm -rf "$ref_fasta.fa" "$ref_fasta.fa.fai" "$ref_fasta.dict" "~{basename( + unfiltered_somatic_vcf + )}" "~{basename(unfiltered_somatic_vcf_index)}" "~{basename( + unfiltered_somatic_vcf_stats + )}" + >>> + + output { + File filtered_somatic_vcf = "~{prefix}.vcf.gz" + File filtered_somatic_vcf_index = "~{prefix}.vcf.gz.tbi" + } + + requirements { + container: "quay.io/biocontainers/gatk4:4.6.2.0--py310hdfd78af_1" + cpu: 1 + memory: "25 GB" + disks: "~{disk_size_gb} GB" + } +} + +task calculate_contamination { + meta { + description: "Run GATK CalculateContamination to estimate cross-sample contamination in tumor sample" + outputs: { + contamination_table: "Table file with estimated contamination fraction and related metrics", + maf_segments: "Table file with segmented genomic intervals for MAF calculation", + } + } + + parameter_meta { + tumor_pileups: "Pileup summaries file for the tumor sample generated by GATK GetPileupSummaries" + normal_pileups: { + description: "Pileup summaries file for the normal sample generated by GATK GetPileupSummaries.", + help: "Optional but recommended for more accurate contamination estimation.", + } + prefix: "Prefix for output files. The extensions '.table' and '.segments.table' will be added." + modify_disk_size_gb: "Additional disk size in GB to allocate" + } + + input { + File tumor_pileups + File? normal_pileups + String prefix = basename(tumor_pileups, ".table") + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(tumor_pileups, "GB") * 2) + ceil(size(normal_pileups, "GB" + ) * 2) + 20 + modify_disk_size_gb + + command <<< + set -euo pipefail + + gatk --java-options "-Xmx~{24000}m" \ + CalculateContamination \ + -I "~{tumor_pileups}" \ + -O "~{prefix}.contamination.table" \ + --tumor-segmentation "~{prefix}.segments.table" \ + ~{if defined(normal_pileups) + then "-matched '" + normal_pileups + "'" + else "" + } + >>> + + output { + File contamination_table = "~{prefix}.contamination.table" + File maf_segments = "~{prefix}.segments.table" + } + + requirements { + container: "quay.io/biocontainers/gatk4:4.6.2.0--py310hdfd78af_1" + memory: "25 GB" + disks: "~{disk_size_gb} GB" + maxRetries: 1 + cpu: 1 + } +} + +task get_pileup_summaries { + meta { + description: "Run GATK GetPileupSummaries to generate pileup summaries for contamination estimation" + outputs: { + pileup_summaries: "Table file with pileup summaries for the input BAM file over the specified variants and intervals", + } + } + + parameter_meta { + bam: "Input BAM file with aligned reads" + bam_index: "Index file for the input BAM file" + intervals: "One or more genomic intervals over which to operate. Often the same file as `variants`" + intervals_index: "Index file for the intervals file" + variants: "VCF file with variants and allele frequencies to summarize pileups over." + variants_index: "Index file for the variant VCF" + reference_fasta: "Reference genome in FASTA format" + reference_fasta_index: "Index file for the reference genome FASTA" + reference_fasta_dict: "Dictionary file for the reference genome FASTA" + prefix: "Prefix for output file. The extension '.table' will be added." + modify_disk_size_gb: "Additional disk size in GB to allocate" + } + + input { + File bam + File bam_index + File intervals + File intervals_index + File variants + File variants_index + File reference_fasta + File reference_fasta_index + File reference_fasta_dict + String prefix = basename(bam, ".bam") + "_pileup_summaries" + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(bam, "GB") * 2) + ceil(size(intervals, "GB") * 2) + ceil( + size(variants, "GB") * 2 + ) + ceil(size(reference_fasta, "GB") * 2) + 20 + modify_disk_size_gb + + String name = basename(bam, ".bam") + + command <<< + set -euo pipefail + + ln -sf "~{bam}" "~{name}.bam" + ln -sf "~{bam_index}" "~{name}.bam.bai" + ln -sf "~{intervals}" "~{basename(intervals)}" + ln -sf "~{intervals_index}" "~{basename(intervals_index)}" + ln -sf "~{variants}" "~{basename(variants)}" + ln -sf "~{variants_index}" "~{basename(variants_index)}" + ref_fasta=~{sub(basename(reference_fasta, ".gz"), ".(fasta|fa)?$", "")} + gunzip -c "~{reference_fasta}" > "$ref_fasta.fa" \ + || ln -sf "~{reference_fasta}" "$ref_fasta.fa" + ln -sf "~{reference_fasta_index}" "$ref_fasta.fa.fai" + ln -sf "~{reference_fasta_dict}" "$ref_fasta.dict" + + gatk --java-options "-Xmx~{(task.memory / (1024 * 1024)) - 3000}m" \ + GetPileupSummaries \ + --disable-bam-index-caching \ + -I "~{name}.bam" \ + -V "~{basename(variants)}" \ + -L "~{basename(intervals)}" \ + -O "~{prefix}.table" + + rm -rf "~{name}.bam" "~{name}.bam.bai" "~{basename(intervals)}" "~{basename( + intervals_index + )}" "~{basename(variants)}" "~{basename(variants_index)}" "$ref_fasta.fa" "$ref_fasta.fa.fai" "$ref_fasta.dict" + >>> + + output { + File pileup_summaries = "~{prefix}.table" + } + + requirements { + container: "quay.io/biocontainers/gatk4:4.6.2.0--py310hdfd78af_1" + memory: "50 GB" + disks: "~{disk_size_gb} GB" + maxRetries: 1 + cpu: 1 + } +} diff --git a/tools/ngsderive.wdl b/tools/ngsderive.wdl index f7856be9a..534fa3ef4 100644 --- a/tools/ngsderive.wdl +++ b/tools/ngsderive.wdl @@ -47,7 +47,7 @@ task strandedness { } Float bam_size = size(bam, "GB") - Int disk_size_gb = ceil(bam_size) + 10 + modify_disk_size_gb + Int disk_size_gb = ceil(bam_size) + 30 + modify_disk_size_gb command <<< set -euo pipefail @@ -120,7 +120,7 @@ task instrument { } Float bam_size = size(bam, "GB") - Int disk_size_gb = ceil(bam_size) + 10 + modify_disk_size_gb + Int disk_size_gb = ceil(bam_size) + 30 + modify_disk_size_gb command <<< set -euo pipefail @@ -179,7 +179,7 @@ task read_length { } Float bam_size = size(bam, "GB") - Int disk_size_gb = ceil(bam_size) + 10 + modify_disk_size_gb + Int disk_size_gb = ceil(bam_size) + 30 + modify_disk_size_gb command <<< set -euo pipefail @@ -236,7 +236,7 @@ task encoding { } Float files_size = size(ngs_files, "GB") - Int disk_size_gb = ceil(files_size) + 10 + modify_disk_size_gb + Int disk_size_gb = ceil(files_size) + 30 + modify_disk_size_gb command <<< ngsderive encoding --verbose \ @@ -304,7 +304,7 @@ task junction_annotation { } Float bam_size = size(bam, "GB") - Int disk_size_gb = ceil(bam_size) + 10 + modify_disk_size_gb + Int disk_size_gb = ceil(bam_size) + 30 + modify_disk_size_gb command <<< set -euo pipefail @@ -403,7 +403,7 @@ task endedness { Int memory_gb = if calc_rpt then (ceil(bam_size * 2.5) + 4 + modify_memory_gb) else 4 - Int disk_size_gb = ceil(bam_size) + 10 + modify_disk_size_gb + Int disk_size_gb = ceil(bam_size) + 30 + modify_disk_size_gb command <<< ngsderive endedness --verbose \ diff --git a/tools/ngsep.wdl b/tools/ngsep.wdl new file mode 100644 index 000000000..b20c4a05d --- /dev/null +++ b/tools/ngsep.wdl @@ -0,0 +1,57 @@ +version 1.2 + +task germline_variant { + meta { + description: "Call germline variants using NGSEP" + outputs: { + vcf_output: "VCF file containing called germline variants", + } + } + + parameter_meta { + reference_fasta: "Reference genome in FASTA format" + bam: "Input BAM file with aligned reads" + output_prefix: "Prefix for the output file with called variants" + threads: "Number of threads to use" + modify_disk_size_gb: "Additional disk size in GB to allocate" + } + + input { + File reference_fasta + File bam + String output_prefix = "ngsep_germline_output" + Int threads = 4 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(reference_fasta, "GB") * 2) + ceil(size(bam, "GB")) + 20 + + modify_disk_size_gb + + command <<< + set -euo pipefail + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + + java -Xmx16g -jar /usr/local/bin/NGSEPcore.jar \ + SingleSampleVariantsDetector \ + -r "$ref_fasta" \ + -i "~{bam}" \ + -o "~{output_prefix}" + # -t "~{threads}" + + rm -rf "$ref_fasta" + >>> + + output { + Array[File] vcf_output = glob("~{output_prefix}*") + } + + requirements { + container: "ghcr.io/stjudecloud/ngsep:branch-minimap2-5.1.0-0" + cpu: threads + memory: "20 GB" + disks: "~{disk_size_gb} GB" + } +} diff --git a/tools/picard.wdl b/tools/picard.wdl index 870c566c2..ab9cf1d4e 100755 --- a/tools/picard.wdl +++ b/tools/picard.wdl @@ -83,9 +83,10 @@ task mark_duplicates { Float bam_size = size(bam, "GB") Int memory_gb = min(ceil(bam_size + 12), 50) + modify_memory_gb + Int disk_size_gb = (if create_bam - then ceil((bam_size * 2) + 10) - else ceil(bam_size + 10) + then ceil((bam_size * 2) + 30) + else ceil(bam_size + 30) ) + modify_disk_size_gb Int java_heap_size = ceil(memory_gb * 0.9) @@ -93,7 +94,10 @@ task mark_duplicates { command <<< set -euo pipefail + mkdir tmp + picard -Xmx~{java_heap_size}g MarkDuplicates \ + -Djava.io.tmpdir="$(pwd)/tmp" \ -I "~{bam}" \ --METRICS_FILE "~{prefix}.metrics.txt" \ -O "~{if create_bam @@ -117,6 +121,8 @@ task mark_duplicates { if ~{create_bam}; then mv "~{prefix}.bai" "~{prefix}.bam.bai" fi + + rm -rf ./tmp >>> output { @@ -202,7 +208,7 @@ task validate_bam { then "--MODE SUMMARY" else "" Float bam_size = size(bam, "GB") - Int disk_size_gb = ceil(bam_size) + 10 + modify_disk_size_gb + Int disk_size_gb = ceil(bam_size * 4) + 50 + modify_disk_size_gb Int java_heap_size = ceil(memory_gb * 0.9) command <<< @@ -291,12 +297,13 @@ task sort { String sort_order = "coordinate" String prefix = basename(bam, ".bam") + ".sorted" String validation_stringency = "SILENT" - Int memory_gb = 25 + Int memory_gb = 35 Int modify_disk_size_gb = 0 } Float bam_size = size(bam, "GB") - Int disk_size_gb = ceil(bam_size * 4) + 10 + modify_disk_size_gb + # Original BAM, sorted fragments (~2.5x original BAM size), final BAM + Int disk_size_gb = ceil(bam_size * 5) + 30 + modify_disk_size_gb Int java_heap_size = ceil(memory_gb * 0.9) String outfile_name = prefix + ".bam" @@ -304,10 +311,15 @@ task sort { command <<< set -euo pipefail - picard -Xmx~{java_heap_size}g SortSam \ + mkdir tmp + + picard -Xmx~{java_heap_size}g \ + -Djava.io.tmpdir="$(pwd)/tmp" \ + SortSam \ -I "~{bam}" \ -O "~{outfile_name}" \ -SO "~{sort_order}" \ + --TMP_DIR "$(pwd)/tmp" \ --CREATE_INDEX true \ --CREATE_MD5_FILE true \ --VALIDATION_STRINGENCY "~{validation_stringency}" @@ -317,6 +329,8 @@ task sort { if [ -f "~{prefix}.bai" ]; then mv "~{prefix}.bai" "~{outfile_name}.bai" fi + + rm -rf ./tmp >>> output { @@ -602,7 +616,7 @@ task collect_alignment_summary_metrics { } Float bam_size = size(bam, "GB") - Int disk_size_gb = ceil(bam_size) + 10 + modify_disk_size_gb + Int disk_size_gb = ceil(bam_size) + 30 + modify_disk_size_gb Int java_heap_size = ceil(memory_gb * 0.9) command <<< @@ -736,7 +750,7 @@ task collect_insert_size_metrics { } Float bam_size = size(bam, "GB") - Int disk_size_gb = ceil(bam_size) + 10 + modify_disk_size_gb + Int disk_size_gb = ceil(bam_size) + 30 + modify_disk_size_gb Int java_heap_size = ceil(memory_gb * 0.9) command <<< @@ -795,7 +809,7 @@ task quality_score_distribution { } Float bam_size = size(bam, "GB") - Int disk_size_gb = ceil(bam_size) + 10 + modify_disk_size_gb + Int disk_size_gb = ceil(bam_size) + 30 + modify_disk_size_gb Int java_heap_size = ceil(memory_gb * 0.9) command <<< @@ -838,6 +852,7 @@ task merge_vcfs { input { Array[File] vcfs + #@ except: UnusedInput Array[File] vcfs_indexes String output_vcf_name Int modify_disk_size_gb = 0 @@ -961,7 +976,7 @@ task create_sequence_dictionary { String? assembly_name String? fasta_url String? species - String outfile_name = basename(fasta, ".fa") + ".dict" + String outfile_name = sub(basename(fasta), "\\.(fa|fasta|fna)(\\.gz)?$", "") + ".dict" Int memory_gb = 16 Int modify_disk_size_gb = 0 } diff --git a/tools/samtools.wdl b/tools/samtools.wdl index ace383ece..2387cb457 100755 --- a/tools/samtools.wdl +++ b/tools/samtools.wdl @@ -20,7 +20,7 @@ task quickcheck { } Float bam_size = size(bam, "GB") - Int disk_size_gb = ceil(bam_size) + 10 + modify_disk_size_gb + Int disk_size_gb = ceil(bam_size) + 30 + modify_disk_size_gb command <<< samtools quickcheck "~{bam}" @@ -178,7 +178,7 @@ task flagstat { } Float bam_size = size(bam, "GB") - Int disk_size_gb = ceil(bam_size) + 10 + modify_disk_size_gb + Int disk_size_gb = ceil(bam_size) + 30 + modify_disk_size_gb command <<< set -euo pipefail @@ -673,7 +673,7 @@ task addreplacerg { } Float bam_size = size(bam, "GB") - Int disk_size_gb = ceil(bam_size * 2) + 10 + modify_disk_size_gb + Int disk_size_gb = ceil(bam_size * 2) + 50 + modify_disk_size_gb String outfile_name = prefix + ".bam" @@ -709,7 +709,7 @@ task addreplacerg { runtime { cpu: ncpu - memory: "4 GB" + memory: "8 GB" disks: "~{disk_size_gb} GB" container: "quay.io/biocontainers/samtools:1.19.2--h50ea8bc_0" maxRetries: 1 @@ -884,7 +884,7 @@ task bam_to_fastq { Int disk_size_gb = ceil(bam_size * if (retain_collated_bam && !collated && paired_end) then 5 else 2 - ) + 10 + modify_disk_size_gb + ) + 30 + modify_disk_size_gb command <<< set -euo pipefail @@ -1222,52 +1222,6 @@ task position_sorted_fixmate { } } -task faidx { - meta { - description: "Creates a `.fai` FASTA index for the input FASTA" - outputs: { - fasta_index: "A `.fai` FASTA index associated with the input FASTA. Filename will be `basename(fasta) + '.fai'`.", - } - } - - parameter_meta { - fasta: "Input FASTA format file to index. Optionally gzip compressed." - modify_disk_size_gb: "Add to or subtract from dynamic disk space allocation. Default disk size is determined by the size of the inputs. Specified in GB." - } - - input { - File fasta - Int modify_disk_size_gb = 0 - } - - Float fasta_size = size(fasta, "GB") - Int disk_size_gb = ceil(fasta_size * 2.5) + 10 + modify_disk_size_gb - - String outfile_name = basename(fasta, ".gz") + ".fai" - - command <<< - set -euo pipefail - - ref_fasta=~{basename(fasta, ".gz")} - gunzip -c "~{fasta}" > "$ref_fasta" \ - || ln -sf "~{fasta}" "$ref_fasta" - - samtools faidx -o "~{outfile_name}" "$ref_fasta" - >>> - - output { - File fasta_index = outfile_name - } - - runtime { - cpu: 1 - memory: "4 GB" - disks: "~{disk_size_gb} GB" - container: "quay.io/biocontainers/samtools:1.19.2--h50ea8bc_0" - maxRetries: 1 - } -} - #@ except: MatchingOutputMeta task markdup { meta { @@ -1436,3 +1390,172 @@ task markdup { maxRetries: 1 } } + +task faidx { + meta { + description: "Creates a `.fai` FASTA index for the input FASTA" + outputs: { + fasta_index: "A `.fai` FASTA index associated with the input FASTA. Filename will be `basename(fasta) + '.fai'`.", + } + } + + parameter_meta { + fasta: "Input FASTA format file to index. Optionally gzip compressed." + modify_disk_size_gb: "Add to or subtract from dynamic disk space allocation. Default disk size is determined by the size of the inputs. Specified in GB." + } + + input { + File fasta + Int modify_disk_size_gb = 0 + } + + Float fasta_size = size(fasta, "GB") + Int disk_size_gb = ceil(fasta_size * 2.5) + 10 + modify_disk_size_gb + + String outfile_name = basename(fasta, ".gz") + ".fai" + + command <<< + set -euo pipefail + + ref_fasta=~{basename(fasta, ".gz")} + gunzip -c "~{fasta}" > "$ref_fasta" \ + || ln -sf "~{fasta}" "$ref_fasta" + + samtools faidx -o "~{outfile_name}" "$ref_fasta" + >>> + + output { + File fasta_index = outfile_name + } + + runtime { + cpu: 1 + memory: "4 GB" + disks: "~{disk_size_gb} GB" + container: "quay.io/biocontainers/samtools:1.19.2--h50ea8bc_0" + maxRetries: 1 + } +} + +task calmd { + meta { + description: "Calculates MD and NM tags" + outputs: { + calmd_bam: "A BAM file with the MD and NM tags calculated. Output filename is `basename(bam) + '.calmd.bam'`.", + } + } + + parameter_meta { + bam: "Input BAM format file to quickcheck" + reference_fasta: "Reference FASTA format file. Must be indexed with a `.fai` file." + prefix: "Prefix for the output file. The extension `.bam` will be added." + modify_disk_size_gb: "Add to or subtract from dynamic disk space allocation. Default disk size is determined by the size of the inputs. Specified in GB." + ncpu: "Number of cores to allocate for task" + } + + input { + File bam + File reference_fasta + String prefix = basename(bam, ".bam") + ".calmd" + Int modify_disk_size_gb = 0 + Int ncpu = 2 + } + + Float bam_size = size(bam, "GiB") + Int disk_size_gb = ceil(bam_size * 2.5) + ceil(size(reference_fasta, "GiB") * 2) + 30 + + modify_disk_size_gb + + command <<< + set -euo pipefail + + (( n_cores = ~{ncpu} - 1 || 1 )) + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + + samtools calmd \ + --threads "$n_cores" \ + -b \ + "~{bam}" \ + "$ref_fasta" \ + > "~{prefix}.bam" + >>> + + output { + File calmd_bam = "~{prefix}.bam" + } + + runtime { + cpu: ncpu + memory: "4 GB" + disks: "~{disk_size_gb} GB" + container: "quay.io/biocontainers/samtools:1.19.2--h50ea8bc_0" + maxRetries: 2 + } +} + +task sort { + meta { + description: "Sorts the input BAM file using `samtools sort`." + outputs: { + sorted_bam: "A sorted BAM file", + } + } + + parameter_meta { + bam: "Input BAM format file to sort" + prefix: "Prefix for the output file. The extension `.bam` will be added." + use_all_cores: { + description: "Use all cores? Recommended for cloud environments.", + group: "Common", + } + ncpu: { + description: "Number of cores to allocate for task", + group: "Common", + } + modify_memory_gb: "Add to or subtract from dynamic memory allocation. Default memory is determined by the size of the inputs. Specified in GB." + modify_disk_size_gb: "Add to or subtract from dynamic disk space allocation. Default disk size is determined by the size of the inputs. Specified in GB." + } + + input { + File bam + String prefix = basename(bam, ".bam") + ".sorted" + Boolean use_all_cores = false + Int ncpu = 4 + Int modify_memory_gb = 0 + Int modify_disk_size_gb = 0 + } + + # Original BAM, intermediate BAM chunks (~1.5x), final BAM, hence 4x + Int disk_size_gb = ceil(size(bam, "GB") * 5) + 10 + modify_disk_size_gb + Int memory_gb = ncpu + 4 + modify_memory_gb + + command <<< + set -euo pipefail + + n_cores=~{ncpu} + if ~{use_all_cores}; then + n_cores=$(nproc) + fi + # -1 because samtools uses one more core than `--threads` specifies + (( n_cores -= 1 )) + + samtools sort \ + --threads "$n_cores" \ + -o "~{prefix}.bam" \ + "~{bam}" + >>> + + output { + File sorted_bam = "~{prefix}.bam" + } + + runtime { + cpu: ncpu + memory: "~{memory_gb} GB" + disks: "~{disk_size_gb} GB" + container: "quay.io/biocontainers/samtools:1.19.2--h50ea8bc_0" + maxRetries: 1 + } +} diff --git a/tools/strelka.wdl b/tools/strelka.wdl new file mode 100644 index 000000000..81ccc8f6f --- /dev/null +++ b/tools/strelka.wdl @@ -0,0 +1,193 @@ +version 1.2 + +task somatic { + meta { + description: "Run Strelka somatic variant calling workflow" + outputs: { + strelka_output: "Directory containing Strelka somatic variant calls", + somatic_indels: "VCF file with somatic indels", + somatic_indels_index: "Index file for the somatic indels VCF", + somatic_snvs: "VCF file with somatic SNVs", + somatic_snvs_index: "Index file for the somatic SNVs VCF", + log_file: "Log file from the Strelka workflow execution", + } + } + + parameter_meta { + reference_fasta: "Reference genome in FASTA format" + reference_fasta_index: "Index file for the reference genome FASTA" + normal_bam: "Input BAM file with aligned reads for normal sample" + normal_bam_index: "Index file for the normal BAM file" + tumor_bam: "Input BAM file with aligned reads for tumor sample" + tumor_bam_index: "Index file for the tumor BAM file" + indel_candidates: "Optional VCF file with candidate indels, recommended to be generated by Manta" + indel_candidates_index: "Index file for the indel candidates VCF" + output_dir: "Directory to store Strelka output" + exome: "Boolean indicating if the data is exome sequencing" + rna: "Boolean indicating if the data is RNA sequencing" + threads: "Number of threads to use" + modify_disk_size_gb: "Additional disk size in GB to allocate" + } + + input { + File reference_fasta + File reference_fasta_index + File normal_bam + File normal_bam_index + File tumor_bam + File tumor_bam_index + File? indel_candidates + #@ except: UnusedInput + File? indel_candidates_index + String output_dir = "strelka_somatic_output" + Boolean exome = false + Boolean rna = false + Int threads = 4 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(reference_fasta, "GB") * 2) + ceil(size(normal_bam, "GB") + * 2) + ceil(size(tumor_bam, "GB") * 2) + (if defined(indel_candidates) + then ceil(size(indel_candidates, "GB")) + else 0 + ) + 20 + modify_disk_size_gb + + String tumor = basename(tumor_bam) + String normal = basename(normal_bam) + + command <<< + set -euo pipefail + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + ln -sf "~{reference_fasta_index}" "$ref_fasta.fai" + + ln -sf "~{tumor_bam}" "~{tumor}" + ln -sf "~{tumor_bam_index}" "~{tumor}.bai" + ln -sf "~{normal_bam}" "~{normal}" + ln -sf "~{normal_bam_index}" "~{normal}.bai" + + configureStrelkaSomaticWorkflow.py \ + --referenceFasta "$ref_fasta" \ + --runDir "~{output_dir}" \ + --tumorBam "~{tumor}" \ + --normalBam "~{normal}" \ + ~{if (exome) + then "--exome" + else "" + } \ + ~{if (rna) + then "--rna" + else "" + } \ + ~{(if (defined(indel_candidates)) + then "--indelCandidates '~{indel_candidates}'" + else "" + )} + + + "~{output_dir}/runWorkflow.py" -m local -j ~{threads} + + rm -rf "$ref_fasta" "$ref_fasta.fai" "~{tumor}" "~{tumor}.bai" "~{normal}" "~{ + normal + }.bai" + >>> + + output { + Directory strelka_output = output_dir + File somatic_indels = "~{output_dir}/results/variants/somatic.indels.vcf.gz" + File somatic_indels_index = "~{output_dir}/results/variants/somatic.indels.vcf.gz.tbi" + File somatic_snvs = "~{output_dir}/results/variants/somatic.snvs.vcf.gz" + File somatic_snvs_index = "~{output_dir}/results/variants/somatic.snvs.vcf.gz.tbi" + File log_file = "~{output_dir}/workspace/pyflow.data/logs/pyflow_log.txt" + } + + requirements { + container: "quay.io/biocontainers/strelka:2.9.10--h9ee0642_1" + cpu: threads + memory: "25 GB" + disks: "~{disk_size_gb} GB" + } +} + +task germline { + meta { + description: "Run Strelka germline variant calling workflow" + outputs: { + strelka_output: "Directory containing Strelka germline variant calls", + log_file: "Log file from the Strelka workflow execution", + } + } + + parameter_meta { + reference_fasta: "Reference genome in FASTA format" + reference_fasta_index: "Index file for the reference genome FASTA" + bam: "Input BAM file with aligned reads" + bam_index: "Index for input BAM file" + output_dir: "Directory to store Strelka output" + exome: "Boolean indicating if the data is exome sequencing" + rna: "Boolean indicating if the data is RNA sequencing" + threads: "Number of threads to use" + modify_disk_size_gb: "Additional disk size in GB to allocate" + } + + input { + File reference_fasta + File reference_fasta_index + File bam + File bam_index + String output_dir = "strelka_germline_output" + Boolean exome = false + Boolean rna = false + Int threads = 4 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil(size(reference_fasta, "GB") * 2) + ceil(size(bam, "GB") * 2) + 20 + + modify_disk_size_gb + + String filename = basename(bam) + + command <<< + set -euo pipefail + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + ln -sf "~{reference_fasta_index}" "$ref_fasta.fai" + + ln -sf "~{bam}" "~{filename}" + ln -sf "~{bam_index}" "~{filename}.bai" + + configureStrelkaGermlineWorkflow.py \ + --referenceFasta "$ref_fasta" \ + --runDir "~{output_dir}" \ + --bam "~{filename}" \ + ~{if (exome) + then "--exome" + else "" + } \ + ~{if (rna) + then "--rna" + else "" + } + + "~{output_dir}/runWorkflow.py" -m local -j ~{threads} + + rm -rf "$ref_fasta" "$ref_fasta.fai" "~{filename}" "~{filename}.bai" "~{output_dir + }/workspace/pyflow.data/logs/tmp" + >>> + + output { + Directory strelka_output = output_dir + File log_file = "~{output_dir}/workspace/pyflow.data/logs/pyflow_log.txt" + } + + requirements { + container: "quay.io/biocontainers/strelka:2.9.10--h9ee0642_1" + cpu: threads + memory: "25 GB" + disks: "~{disk_size_gb} GB" + } +} diff --git a/tools/test/bwamem2.yaml b/tools/test/bwamem2.yaml new file mode 100644 index 000000000..717831382 --- /dev/null +++ b/tools/test/bwamem2.yaml @@ -0,0 +1,24 @@ +align: + - name: works + inputs: + $samples: + read_one_fastq_gz: + - fastqs/test_R1.fq.gz + - fastqs/test_R2.fq.gz + - fastqs/random10k.r1.fq.gz + - fastqs/random10k.r2.fq.gz + read_two_fastq_gz: + - fastqs/test_R2.fq.gz + - null + - fastqs/random10k.r2.fq.gz + - null + read_group: + - "@RG\\tID:test\\tSM:test" + reference_index: + - reference/GRCh38.chrY_chrM.bwamem2_db.tar.gz +index: + - name: works + tags: [ reference, slow ] + inputs: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa \ No newline at end of file diff --git a/tools/test/clair.yaml b/tools/test/clair.yaml new file mode 100644 index 000000000..cc4cfc5bb --- /dev/null +++ b/tools/test/clair.yaml @@ -0,0 +1,187 @@ +# The `clair3` task requires a `Directory model` input containing pre-trained neural network +# weights. The `hkubal/clair3:v2.0.0` container ships built-in models at `/opt/models/`. +# Tests use the bundled Illumina model at `ilmn`. +clair3: + - name: works + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $sample: + bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + model: + - ilmn + + - name: with_vcf_candidates + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $sample: + bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + model: + - ilmn + vcf_candidates: + - vcfs/testY.vcf.gz + contigs: + - chrY + + - name: gvcf_mode + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $sample: + bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + model: + - ilmn + gvcf: + - true + + - name: all_contigs + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $sample: + bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + model: + - ilmn + all_contigs: + - true + + - name: print_ref_calls + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $sample: + bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + model: + - ilmn + print_ref_calls: + - true + +clairs: + - name: works + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $tumor: + tumor_bam: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam + tumor_bam_index: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam.bai + $normal: + normal_bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + normal_bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + platform: + - ilmn + + - name: with_vcf_candidates + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $tumor: + tumor_bam: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam + tumor_bam_index: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam.bai + $normal: + normal_bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + normal_bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + platform: + - ilmn + vcf_candidates: + - vcfs/testY.vcf.gz + contigs: + - chrY + - chrM + + - name: all_contigs + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $tumor: + tumor_bam: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam + tumor_bam_index: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam.bai + $normal: + normal_bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + normal_bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + platform: + - ilmn + all_contigs: + - true + + - name: print_germline_calls + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $tumor: + tumor_bam: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam + tumor_bam_index: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam.bai + $normal: + normal_bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + normal_bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + platform: + - ilmn + print_germline_calls: + - true diff --git a/tools/test/deepvariant.yaml b/tools/test/deepvariant.yaml new file mode 100644 index 000000000..fa1e4e60c --- /dev/null +++ b/tools/test/deepvariant.yaml @@ -0,0 +1,177 @@ +deepvariant: + - name: works + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $sample: + bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + threads: + - 2 + + - name: wes_model + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $sample: + bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + model_type: + - WES + threads: + - 2 + + - name: runtime_report + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $sample: + bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + runtime_report: + - true + threads: + - 2 + + - name: vcf_stats_report + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $sample: + bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + vcf_stats_report: + - true + threads: + - 2 + + - name: custom_haploid_chromosomes + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $sample: + bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + haploid_chromosomes: + - ["chrY"] + threads: + - 2 + +deepsomatic: + - name: works + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $tumor: + tumor_bam: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam + tumor_bam_index: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam.bai + $normal: + normal_bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + normal_bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + model_type: + - WGS + + - name: wes_model + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $tumor: + tumor_bam: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam + tumor_bam_index: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam.bai + $normal: + normal_bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + normal_bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + model_type: + - WES + + - name: runtime_report + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $tumor: + tumor_bam: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam + tumor_bam_index: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam.bai + $normal: + normal_bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + normal_bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + model_type: + - WGS + runtime_report: + - true + + - name: vcf_stats_report + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $tumor: + tumor_bam: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam + tumor_bam_index: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam.bai + $normal: + normal_bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + normal_bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + model_type: + - WGS + vcf_stats_report: + - true diff --git a/tools/test/gatk4.yaml b/tools/test/gatk4.yaml index bfb256ae6..1a429b2e9 100644 --- a/tools/test/gatk4.yaml +++ b/tools/test/gatk4.yaml @@ -9,6 +9,8 @@ apply_bqsr: - bams/test_rnaseq_variant.bam.bai recalibration_report: - test_rnaseq_variant.recal.txt + ncpu: + - 1 base_recalibrator: - name: works inputs: @@ -26,14 +28,16 @@ base_recalibrator: - reference/GRCh38.chr1_chr19.dict $dbsnp: dbSNP_vcf: - - reference/Homo_sapiens_assembly38.dbsnp138.top5000.vcf + - reference/Homo_sapiens_assembly38.dbsnp138.top5000.vcf.gz dbSNP_vcf_index: - - reference/Homo_sapiens_assembly38.dbsnp138.top5000.vcf.idx + - reference/Homo_sapiens_assembly38.dbsnp138.top5000.vcf.gz.tbi $known_indels: known_indels_sites_vcfs: - [ reference/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz ] known_indels_sites_indices: - [ reference/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi ] + ncpu: + - 1 haplotype_caller: - name: works tags: [ slow ] @@ -98,4 +102,6 @@ mark_duplicates_spark: - bams/test.bwa_aln_pe.chrY_chrM.bam - bams/Aligned.sortedByCoord.chr9_chr22.bam - bams/test_rnaseq_variant.bam - - bams/test.bam \ No newline at end of file + - bams/test.bam + ncpu: + - 1 \ No newline at end of file diff --git a/tools/test/manta.yaml b/tools/test/manta.yaml new file mode 100644 index 000000000..282fdb84f --- /dev/null +++ b/tools/test/manta.yaml @@ -0,0 +1,71 @@ +manta_germline: + - name: works + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $sample: + bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + + - name: exome_mode + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $sample: + bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + exome: + - true + +manta_somatic: + - name: works + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $normal: + normal_bam: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam + normal_bam_index: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam.bai + $tumor: + tumor_bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + tumor_bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + + - name: exome_mode + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $normal: + normal_bam: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam + normal_bam_index: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam.bai + $tumor: + tumor_bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + tumor_bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + exome: + - true diff --git a/tools/test/minimap2.yaml b/tools/test/minimap2.yaml new file mode 100644 index 000000000..5ffe7ed73 --- /dev/null +++ b/tools/test/minimap2.yaml @@ -0,0 +1,130 @@ +align: + - name: works + inputs: + $samples: + read_one_fastq_gz: + - fastqs/test_R1.fq.gz + - fastqs/test_R2.fq.gz + - fastqs/random10k.r1.fq.gz + - fastqs/random10k.r2.fq.gz + read_two_fastq_gz: + - fastqs/test_R2.fq.gz + - null + - fastqs/random10k.r2.fq.gz + - null + read_group: + - "@RG\\tID:test\\tSM:test" + reference_index: + - reference/GRCh38.chrY_chrM.mmi + - name: fasta_reference + inputs: + $samples: + read_one_fastq_gz: + - fastqs/test_R1.fq.gz + - fastqs/test_R2.fq.gz + - fastqs/random10k.r1.fq.gz + - fastqs/random10k.r2.fq.gz + read_two_fastq_gz: + - fastqs/test_R2.fq.gz + - null + - fastqs/random10k.r2.fq.gz + - null + read_group: + - "@RG\\tID:test\\tSM:test" + reference_index: + - reference/GRCh38.chrY_chrM.fa + - name: fasta_reference_paf + inputs: + $samples: + read_one_fastq_gz: + - fastqs/test_R1.fq.gz + - fastqs/test_R2.fq.gz + - fastqs/random10k.r1.fq.gz + - fastqs/random10k.r2.fq.gz + read_two_fastq_gz: + - fastqs/test_R2.fq.gz + - null + - fastqs/random10k.r2.fq.gz + - null + read_group: + - "@RG\\tID:test\\tSM:test" + reference_index: + - reference/GRCh38.chrY_chrM.fa + output_paf: + - true + cigar_in_paf: + - true + - false + - name: fasta_reference_flags + inputs: + $samples: + read_one_fastq_gz: + - fastqs/test_R1.fq.gz + - fastqs/test_R2.fq.gz + - fastqs/random10k.r1.fq.gz + - fastqs/random10k.r2.fq.gz + read_two_fastq_gz: + - fastqs/test_R2.fq.gz + - null + - fastqs/random10k.r2.fq.gz + - null + read_group: + - "@RG\\tID:test\\tSM:test" + reference_index: + - reference/GRCh38.chrY_chrM.fa + ignore_base_quality: + - true + - false + output_md_tag: + - true + - false + soft_clip: + - true + - false + secondary_alignments: + - true + - false + # eqx (-X) is incompatible with --secondary=no in minimap2, so it is tested + # separately with secondary_alignments at its default (true) + - name: fasta_reference_eqx + inputs: + $samples: + read_one_fastq_gz: + - fastqs/test_R1.fq.gz + - fastqs/test_R2.fq.gz + - fastqs/random10k.r1.fq.gz + - fastqs/random10k.r2.fq.gz + read_two_fastq_gz: + - fastqs/test_R2.fq.gz + - null + - fastqs/random10k.r2.fq.gz + - null + read_group: + - "@RG\\tID:test\\tSM:test" + reference_index: + - reference/GRCh38.chrY_chrM.fa + eqx: + - true + - false + - name: fasta_reference_eqx_no_secondary_fails + inputs: + read_one_fastq_gz: + - fastqs/test_R1.fq.gz + read_group: + - "@RG\\tID:test\\tSM:test" + reference_index: + - reference/GRCh38.chrY_chrM.fa + eqx: + - true + secondary_alignments: + - false + assertions: + exit_code: 1 + stderr: + - "-X/-P and --secondary=no can't be applied at the same time" +index: + - name: works + tags: [ reference, slow ] + inputs: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa \ No newline at end of file diff --git a/tools/test/mutect2.yaml b/tools/test/mutect2.yaml new file mode 100644 index 000000000..d78e29fd4 --- /dev/null +++ b/tools/test/mutect2.yaml @@ -0,0 +1,115 @@ +mutect2: + - name: works + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + reference_fasta_dict: + - reference/GRCh38.chrY_chrM.dict + $normal: + normal_bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + normal_bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + $tumor: + tumor_bam: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam + tumor_bam_index: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam.bai + tumor_sample_name: + - tumor + normal_sample_name: + - test + + - name: with_germline_resource + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + reference_fasta_dict: + - reference/GRCh38.chrY_chrM.dict + $normal: + normal_bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + normal_bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + $tumor: + tumor_bam: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam + tumor_bam_index: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam.bai + $germline: + germline_resource_vcf: + - mutect2/af-only-gnomad.hg38.chrY_chrM.vcf.gz + germline_resource_vcf_index: + - mutect2/af-only-gnomad.hg38.chrY_chrM.vcf.gz.tbi + tumor_sample_name: + - tumor + normal_sample_name: + - test + +get_pileup_summaries: + - name: works + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + reference_fasta_dict: + - reference/GRCh38.chrY_chrM.dict + $sample: + bam: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam + bam_index: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam.bai + $sites: + intervals: + - mutect2/af-only-gnomad.hg38.chrY_chrM.vcf.gz + intervals_index: + - mutect2/af-only-gnomad.hg38.chrY_chrM.vcf.gz.tbi + variants: + - mutect2/af-only-gnomad.hg38.chrY_chrM.vcf.gz + variants_index: + - mutect2/af-only-gnomad.hg38.chrY_chrM.vcf.gz.tbi + +calculate_contamination: + - name: tumor_only + inputs: + tumor_pileups: + - mutect2/test.bwa_aln_pe.with_variants.chrY_chrM_pileup_summaries.table + + - name: with_matched_normal + inputs: + tumor_pileups: + - mutect2/test.bwa_aln_pe.with_variants.chrY_chrM_pileup_summaries.table + normal_pileups: + - mutect2/test.bwa_aln_pe.chrY_chrM_pileup_summaries.table + +filter_mutect: + - name: works + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + reference_fasta_dict: + - reference/GRCh38.chrY_chrM.dict + $unfiltered: + unfiltered_somatic_vcf: + - vcfs/test.bwa_aln_pe.chrY_chrM.mutect2.vcf.gz + unfiltered_somatic_vcf_index: + - vcfs/test.bwa_aln_pe.chrY_chrM.mutect2.vcf.gz.tbi + unfiltered_somatic_vcf_stats: + - vcfs/test.bwa_aln_pe.chrY_chrM.mutect2.vcf.gz.stats + contamination_table: + - mutect2/test.bwa_aln_pe.with_variants.chrY_chrM_pileup_summaries.contamination.table + maf_segments: + - mutect2/test.bwa_aln_pe.with_variants.chrY_chrM_pileup_summaries.segments.table diff --git a/tools/test/ngsep.yaml b/tools/test/ngsep.yaml new file mode 100644 index 000000000..d8da25fd6 --- /dev/null +++ b/tools/test/ngsep.yaml @@ -0,0 +1,11 @@ +germline_variant: + - name: works + inputs: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + threads: + - 1 + requirements.memory: + - "4 GB" diff --git a/tools/test/picard.yaml b/tools/test/picard.yaml index 5d6baa6cc..1e9e05971 100644 --- a/tools/test/picard.yaml +++ b/tools/test/picard.yaml @@ -166,7 +166,65 @@ clean_sam: - SILENT memory_gb: - 5 -# TODO: all the `collect*` QC tasks -# TODO: merge_vcfs -# TODO: scatter_interval_list -# TODO: create_sequence_dictionary +collect_wgs_metrics: + - name: works + inputs: + bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + +collect_alignment_summary_metrics: + - name: works + inputs: + bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + +collect_gc_bias_metrics: + - name: works + inputs: + bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + +collect_insert_size_metrics: + - name: works + inputs: + bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + +quality_score_distribution: + - name: works + inputs: + bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + +merge_vcfs: + - name: works + inputs: + $vcfs: + vcfs: + - [vcfs/test1.vcf.gz, vcfs/test2.vcf.gz] + vcfs_indexes: + - [vcfs/test1.vcf.gz.tbi, vcfs/test2.vcf.gz.tbi] + output_vcf_name: + - merged.vcf.gz + +scatter_interval_list: + - name: works + inputs: + interval_list: + - chr1_chr19.interval_list + scatter_count: + - 2 + +create_sequence_dictionary: + - name: works + inputs: + fasta: + - reference/GRCh38.chrY_chrM.fa + - name: gzipped_fasta + inputs: + fasta: + - reference/GRCh38.chr9_chr22.fa.gz diff --git a/tools/test/strelka.yaml b/tools/test/strelka.yaml new file mode 100644 index 000000000..40541c323 --- /dev/null +++ b/tools/test/strelka.yaml @@ -0,0 +1,95 @@ +somatic: + - name: works + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $normal: + normal_bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + normal_bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + $tumor: + tumor_bam: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam + tumor_bam_index: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam.bai + + - name: with_indel_candidates + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $normal: + normal_bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + normal_bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + $tumor: + tumor_bam: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam + tumor_bam_index: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam.bai + $indels: + indel_candidates: + - vcfs/testY.vcf.gz + indel_candidates_index: + - vcfs/testY.vcf.gz.tbi + + - name: exome_mode + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $normal: + normal_bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + normal_bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + $tumor: + tumor_bam: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam + tumor_bam_index: + - bams/test.bwa_aln_pe.with_variants.chrY_chrM.bam.bai + exome: + - true + +germline: + - name: works + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $sample: + bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + + - name: exome_mode + tags: [slow] + inputs: + $reference: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa + reference_fasta_index: + - reference/GRCh38.chrY_chrM.fa.fai + $sample: + bam: + - bams/test.bwa_aln_pe.chrY_chrM.bam + bam_index: + - bams/test.bwa_aln_pe.chrY_chrM.bam.bai + exome: + - true diff --git a/tools/test/vg.yaml b/tools/test/vg.yaml new file mode 100644 index 000000000..d1b045160 --- /dev/null +++ b/tools/test/vg.yaml @@ -0,0 +1,90 @@ +giraffe: + - name: works + inputs: + $samples: + read_one_fastq_gz: + - fastqs/test_R1.fq.gz + - fastqs/test_R2.fq.gz + - fastqs/random10k.r1.fq.gz + - fastqs/random10k.r2.fq.gz + read_two_fastq_gz: + - fastqs/test_R2.fq.gz + - null + - fastqs/random10k.r2.fq.gz + - null + gbz_graph: + - reference/GRCh38.chrY_chrM.giraffe.gbz + minimizer_index: + - reference/GRCh38.chrY_chrM.shortread.withzip.min + distance_index: + - reference/GRCh38.chrY_chrM.dist + zipcode_name: + - reference/GRCh38.chrY_chrM.shortread.zipcodes + - name: output_formats + inputs: + read_one_fastq_gz: + - fastqs/test_R1.fq.gz + read_two_fastq_gz: + - fastqs/test_R2.fq.gz + gbz_graph: + - reference/GRCh38.chrY_chrM.giraffe.gbz + minimizer_index: + - reference/GRCh38.chrY_chrM.shortread.withzip.min + distance_index: + - reference/GRCh38.chrY_chrM.dist + zipcode_name: + - reference/GRCh38.chrY_chrM.shortread.zipcodes + output_format: + - gam + - gaf + - json + - tsv + - SAM + - BAM + - CRAM + output_name: + - aligned.out + # chaining-based presets (chaining-sr, hifi, r10, srold) do not support + # paired-end reads; test all presets unpaired and the non-chaining presets paired + - name: presets_unpaired + inputs: + read_one_fastq_gz: + - fastqs/test_R1.fq.gz + gbz_graph: + - reference/GRCh38.chrY_chrM.giraffe.gbz + minimizer_index: + - reference/GRCh38.chrY_chrM.shortread.withzip.min + distance_index: + - reference/GRCh38.chrY_chrM.dist + zipcode_name: + - reference/GRCh38.chrY_chrM.shortread.zipcodes + preset: + - chaining-sr + - default + - fast + - hifi + - r10 + - srold + - name: presets_paired + inputs: + read_one_fastq_gz: + - fastqs/test_R1.fq.gz + read_two_fastq_gz: + - fastqs/test_R2.fq.gz + gbz_graph: + - reference/GRCh38.chrY_chrM.giraffe.gbz + minimizer_index: + - reference/GRCh38.chrY_chrM.shortread.withzip.min + distance_index: + - reference/GRCh38.chrY_chrM.dist + zipcode_name: + - reference/GRCh38.chrY_chrM.shortread.zipcodes + preset: + - default + - fast +index: + - name: works + tags: [ reference, slow ] + inputs: + reference_fasta: + - reference/GRCh38.chrY_chrM.fa diff --git a/tools/util.wdl b/tools/util.wdl index 319d72ed9..fdf4f1c31 100644 --- a/tools/util.wdl +++ b/tools/util.wdl @@ -148,7 +148,7 @@ task compression_integrity { } Float file_size = size(bgzipped_file, "GB") - Int disk_size_gb = ceil(file_size) + 10 + modify_disk_size_gb + Int disk_size_gb = ceil(file_size) + 30 + modify_disk_size_gb command <<< bgzip -t "~{bgzipped_file}" @@ -303,6 +303,7 @@ task make_coverage_regions_bed { } runtime { + memory: "8 GB" disks: "~{disk_size_gb} GB" container: "quay.io/biocontainers/bedops:2.4.41--h9f5acd7_0" maxRetries: 1 @@ -332,7 +333,7 @@ task global_phred_scores { } Float bam_size = size(bam, "GB") - Int disk_size_gb = ceil(bam_size) + 10 + modify_disk_size_gb + Int disk_size_gb = ceil(bam_size) + 30 + modify_disk_size_gb String outfile_name = prefix + ".global_PHRED_scores.tsv" diff --git a/tools/vg.wdl b/tools/vg.wdl new file mode 100644 index 000000000..4475945ec --- /dev/null +++ b/tools/vg.wdl @@ -0,0 +1,195 @@ +version 1.2 + +task giraffe { + meta { + description: "Align DNA sequences against a variation graph using vg giraffe" + outputs: { + alignments: "The output alignment file in GAM format", + } + } + + parameter_meta { + read_one_fastq_gz: "Input gzipped FASTQ read one file to align with vg giraffe" + gbz_graph: "The vg GBZ graph file for the reference genome" + minimizer_index: "The vg minimizer index file for the reference genome" + zipcode_name: "The vg zipcode name file for the reference genome" + distance_index: "The vg distance index file for the reference genome" + read_two_fastq_gz: "Input gzipped FASTQ read two file to align with vg giraffe" + haploytype: "The haplotype information file" + kff: "The KFF file containing kmer counts" + sample_name: "The sample name to include" + read_group: "The read group" + output_name: "The name of the output alignment file" + output_format: { + description: "The output format for alignments", + options: [ + "gam", + "gaf", + "json", + "tsv", + "SAM", + "BAM", + "CRAM", + ], + } + preset: { + description: "vg giraffe preset for alignment", + options: [ + "chaining-sr", + "default", + "fast", + "hifi", + "r10", + "srold", + ], + } + threads: "Number of threads to use for alignment" + modify_disk_size_gb: "Additional disk space to allocate (in GB)" + } + + input { + File read_one_fastq_gz + File gbz_graph + File minimizer_index + File zipcode_name + File distance_index + File? read_two_fastq_gz + File? haploytype + File? kff + String? sample_name + String? read_group + String output_name = "aligned.bam" + String output_format = "BAM" + String preset = "default" + Int threads = 4 + Int modify_disk_size_gb = 0 + } + + Int disk_size_gb = ceil((size(read_one_fastq_gz, "GB") + size(read_two_fastq_gz, "GB") + ) * 2) + ceil(size(gbz_graph, "GB")) + ceil(size(minimizer_index, "GB")) + ceil(size( + distance_index, "GB" + )) + ceil(size(zipcode_name, "GB")) + 10 + modify_disk_size_gb + + command <<< + vg giraffe \ + -t ~{threads} \ + -Z "~{gbz_graph}" \ + -m "~{minimizer_index}" \ + -d "~{distance_index}" \ + -z "~{zipcode_name}" \ + -f "~{read_one_fastq_gz}" \ + ~{if defined(read_two_fastq_gz) + then "-f \"~{read_two_fastq_gz}\"" + else "" + } \ + -o "~{output_format}" \ + ~{if defined(sample_name) + then "--sample \"~{sample_name}\"" + else "" + } \ + ~{if defined(read_group) + then "--read-group \"~{read_group}\"" + else "" + } \ + ~{if defined(haploytype) + then "--haplotype-name \"~{haploytype}\"" + else "" + } \ + ~{if defined(kff) + then "--kff-name \"~{kff}\"" + else "" + } \ + --parameter-preset "~{preset}" \ + > "~{output_name}" + >>> + + output { + File alignments = "~{output_name}" + } + + requirements { + container: "quay.io/biocontainers/vg:1.70.0--h9ee0642_0" + cpu: threads + memory: "60 GB" + disks: "~{disk_size_gb} GB" + } +} + +task index { + meta { + description: "Index a reference genome for alignment with vg giraffe" + outputs: { + reference_index: "The vg giraffe index file for the reference genome", + } + } + + parameter_meta { + reference_fasta: "The reference genome in FASTA format to be indexed" + vcf_files: "VCF(s) containing variants to augment the graph" + transcript_gff: "GFF(s) containing transcript annotations" + db_prefix: "The base name for the output index files" + gff_feature: "The feature type in the GFF to use for transcripts" + gff_id_tag: "The attribute tag in the GFF to use as transcript ID" + autoindex_workflow: { + description: "The vg autoindex workflow to use", + choices: [ + "map", + "mpmap", + "rpvg", + "giraffe", + "sr-giraffe", + "lr-giraffe", + ], + } + modify_disk_size_gb: "Additional disk space to allocate (in GB)" + threads: "Number of threads to use for indexing" + } + + input { + File reference_fasta + Array[File] vcf_files = [] + Array[File] transcript_gff = [] + String db_prefix = "reference" + String gff_feature = "exon" + String gff_id_tag = "transcript_id" + String autoindex_workflow = "giraffe" + Int modify_disk_size_gb = 0 + Int threads = 4 + } + + Float input_fasta_size = size(reference_fasta, "GB") + Float vcf_size = size(vcf_files, "GB") + Float transcript_gff_size = size(transcript_gff, "GB") + Int disk_size_gb = ceil(input_fasta_size * 2) + ceil(vcf_size * 2) + ceil( + transcript_gff_size * 2 + ) + 10 + modify_disk_size_gb + + command <<< + set -euo pipefail + + ref_fasta=~{basename(reference_fasta, ".gz")} + gunzip -c "~{reference_fasta}" > "$ref_fasta" \ + || ln -sf "~{reference_fasta}" "$ref_fasta" + + vg autoindex \ + --workflow "~{autoindex_workflow}" \ + -r "$ref_fasta" \ + -p "~{db_prefix}" \ + ~{sep(" ", prefix("-v ", quote(vcf_files)))} \ + ~{sep(" ", prefix("-x ", quote(transcript_gff)))} \ + -t ~{threads} \ + --gff-feature "~{gff_feature}" \ + --gff-tx-tag "~{gff_id_tag}" + >>> + + output { + Array[File] reference_index = glob("~{db_prefix}*") + } + + requirements { + container: "quay.io/biocontainers/vg:1.70.0--h9ee0642_0" + cpu: threads + memory: "120 GB" + disks: "~{disk_size_gb} GB" + } +} diff --git a/workflows/evaluation/dnaseq-alignment.wdl b/workflows/evaluation/dnaseq-alignment.wdl new file mode 100644 index 000000000..f590118aa --- /dev/null +++ b/workflows/evaluation/dnaseq-alignment.wdl @@ -0,0 +1,149 @@ +version 1.2 + +import "../../data_structures/read_group.wdl" as read_group_ds +import "../../tools/bwa.wdl" as bwa +import "../../tools/bwamem2.wdl" as bwamem2 +import "../../tools/minimap2.wdl" as minimap2 +import "../../tools/samtools.wdl" as samtools +import "../../tools/vg.wdl" as vg +import "../general/alignment-post.wdl" as alignment_post + +workflow align { + meta { + name: "Aligner comparison workflow" + description: "Align sequencing reads to a reference genome with a variety of aligners" + category: "Utility" + outputs: { + bwamem2_bam: "BAM file output from BWA-MEM2 alignment", + bwamem2_bam_index: "BAM index file for BWA-MEM2 alignments", + vg_giraffe_bam: "BAM file output from vg giraffe alignment", + vg_giraffe_bam_index: "BAM index file for vg giraffe alignments", + minimap2_bam: "BAM file output from minimap2 alignment", + minimap2_bam_index: "BAM index file for minimap2 alignments", + bwa_bam: "BAM file output from BWA alignment", + bwa_bam_index: "BAM index file for BWA alignments", + } + allowNestedInputs: true + } + + parameter_meta { + read_one_fastq_gz: "Input gzipped FASTQ read one file to align" + bwamem2_reference: "The BWA-MEM2 index file for the reference genome" + giraffe_gbz_graph: "The vg GBZ graph file for the reference genome" + giraffe_minimizer_index: "The vg minimizer index file for the reference genome" + giraffe_zipcode_name: "The vg zipcode name file for the reference genome" + giraffe_distance_index: "The vg distance index file for the reference genome" + minimap2_reference: "The minimap2 index file for the reference genome" + bwamem_reference: "The BWA index tar.gz file for the reference genome" + read_group: "The read group to be included in the SAM header" + read_two_fastq_gz: "Input gzipped FASTQ read two file to align (for paired-end data)" + output_prefix: "Output file prefix for aligned reads" + threads: "Number of threads to use for alignment" + modify_disk_size_gb: "Additional disk space to allocate (in GB)" + } + + input { + File read_one_fastq_gz + File bwamem2_reference + File giraffe_gbz_graph + File giraffe_minimizer_index + File giraffe_zipcode_name + File giraffe_distance_index + File minimap2_reference + File bwamem_reference + ReadGroup read_group + File? read_two_fastq_gz + String output_prefix = "aligned_output" + Int threads = 30 + Int modify_disk_size_gb = 0 + } + + call read_group_ds.read_group_to_string as read_group_string { + read_group, + format_as_sam_record = true, + } + + call bwamem2.align as bwamem2 { + read_one_fastq_gz, + read_two_fastq_gz, + reference_index = bwamem2_reference, + prefix = "~{output_prefix}_bwamem2", + threads, + modify_disk_size_gb, + read_group = read_group_string.validated_read_group, + } + + call alignment_post.alignment_post as bwamem2_post { + bam = bwamem2.alignments, + mark_duplicates = true, + } + + call vg.giraffe as vg_giraffe { + read_one_fastq_gz, + read_two_fastq_gz, + gbz_graph = giraffe_gbz_graph, + minimizer_index = giraffe_minimizer_index, + zipcode_name = giraffe_zipcode_name, + distance_index = giraffe_distance_index, + output_name = "~{output_prefix}_vg.bam", + output_format = "BAM", + threads, + modify_disk_size_gb, + } + + call read_group_ds.read_group_to_array { + read_group, + } + + call samtools.addreplacerg { + bam = vg_giraffe.alignments, + orphan_only = false, + read_group_line = read_group_to_array.converted_read_group, + } + + call alignment_post.alignment_post as giraffe_post { + bam = addreplacerg.tagged_bam, + mark_duplicates = true, + } + + call minimap2.align as minimap2 { + read_one_fastq_gz, + read_two_fastq_gz, + reference_index = minimap2_reference, + output_name = "~{output_prefix}_minimap2.bam", + threads, + modify_disk_size_gb, + read_group = read_group_string.validated_read_group, + } + + call alignment_post.alignment_post as minimap2_post { + bam = minimap2.alignments, + mark_duplicates = true, + } + + call bwa.bwa_mem as bwa_mem { + read_one_fastq_gz, + read_two_fastq_gz, + bwa_db_tar_gz = bwamem_reference, + prefix = "~{output_prefix}_bwa", + ncpu = threads, + modify_disk_size_gb, + read_group = read_group_string.validated_read_group, + } + + call alignment_post.alignment_post as bwa_mem_post { + bam = bwa_mem.bam, + mark_duplicates = true, + } + + output { + File bwamem2_bam = bwamem2_post.processed_bam + File bwamem2_bam_index = bwamem2_post.bam_index + File vg_giraffe_bam = giraffe_post.processed_bam + File vg_giraffe_bam_index = giraffe_post.bam_index + File minimap2_bam = minimap2_post.processed_bam + File minimap2_bam_index = minimap2_post.bam_index + File bwa_bam = bwa_mem_post.processed_bam + File bwa_bam_index = bwa_mem_post.bam_index + } +} diff --git a/workflows/evaluation/germline-variant-calling.wdl b/workflows/evaluation/germline-variant-calling.wdl new file mode 100644 index 000000000..e848a932e --- /dev/null +++ b/workflows/evaluation/germline-variant-calling.wdl @@ -0,0 +1,156 @@ +version 1.3 + +import "../../tools/clair.wdl" +import "../../tools/deepvariant.wdl" +import "../../tools/gatk4.wdl" +import "../../tools/manta.wdl" +import "../../tools/ngsep.wdl" +import "../../tools/strelka.wdl" + +workflow variant_calling { + meta { + description: "Runs a series of variant calling tools on a processed BAM file." + help: "The workflow is designed to be flexible and allow users to choose which variant callers to run based on their specific needs and preferences." + outputs: { + clair_vcf: "VCF file produced by Clair", + clair_full_vcf: "VCF file produced by Clair containing all variants before filtering", + clair_merged_vcf: "VCF file produced by Clair after merging pileup and full-alignment calls", + deepvariant_vcf: "VCF file produced by DeepVariant", + deepvariant_gvcf: "gVCF file produced by DeepVariant", + deepvariant_runtime_report: "Optional HTML report of runtime metrics from DeepVariant", + deepvariant_vcf_stats: "Optional HTML report of VCF statistics from DeepVariant", + manta_output: "Directory containing Manta structural variant calls and associated files", + manta_log: "Log file from the Manta workflow execution", + ngsep_vcf: "VCF file produced by NGSEP", + strelka_output: "Directory containing Strelka somatic variant calls and associated files", + strelka_log: "Log file from the Strelka workflow execution", + haplotype_caller_vcf: "VCF file output by GATK HaplotypeCaller after VQSR and genotype posterior calculation", + haplotype_caller_vcf_index: "Index file for the HaplotypeCaller VCF output", + } + allowNestedInputs: true + } + + parameter_meta { + processed_bam: "Input BAM format file that has been processed and is ready for variant calling" + processed_bam_index: "BAI index file associated with the input BAM file" + reference_genome: "Reference genome in FASTA format that corresponds to the alignments in the input BAM file" + reference_genome_index: "Index file for the reference genome FASTA, typically with a .fai extension" + reference_genome_dictionary: "Dictionary file for FASTA format genome" + dbSNP_vcf: "dbSNP VCF file" + dbSNP_vcf_index: "dbSNP VCF index file" + interval_list: { + description: "Interval list indicating regions in which to call variants", + external_help: "https://gatk.broadinstitute.org/hc/en-us/articles/360035531852-Intervals-and-interval-lists", + } + clair3_model: "Pre-trained Clair3 model file to use for variant calling with Clair" + known_indels_sites_vcfs: "Optional array of VCF files containing known indel sites to be used in variant calling and filtering" + known_indels_sites_indices: "Optional array of index files corresponding to the known indel site VCF files" + resources: "Optional array of additional resource files to be used in variant calling and filtering, such as additional VCFs or BED files" + run_clair: "Whether to run Clair for variant calling" + run_deepvariant: "Whether to run DeepVariant for variant calling" + run_haplotype_caller: "Whether to run GATK's Haplotype Caller for variant calling" + run_manta: "Whether to run Manta for structural variant calling" + run_ngsep: "Whether to run NGSEP for variant calling" + run_strelka: "Whether to run Strelka for variant calling" + } + + input { + File processed_bam + File processed_bam_index + File reference_genome + File reference_genome_index + File reference_genome_dictionary + #@ except: SnakeCase + File dbSNP_vcf + #@ except: SnakeCase + File dbSNP_vcf_index + File interval_list + Directory clair3_model + Array[File] known_indels_sites_vcfs = [] + Array[File] known_indels_sites_indices = [] + Array[Resource] resources = [] + Boolean run_clair = true + Boolean run_deepvariant = true + Boolean run_haplotype_caller = true + Boolean run_manta = true + Boolean run_ngsep = true + Boolean run_strelka = true + } + + if (run_clair) { + call clair.clair3 { + bam = processed_bam, + bam_index = processed_bam_index, + reference_fasta = reference_genome, + reference_fasta_index = reference_genome_index, + model = clair3_model, + } + } + + if (run_deepvariant) { + call deepvariant.deepvariant { + bam = processed_bam, + bam_index = processed_bam_index, + reference_fasta = reference_genome, + reference_fasta_index = reference_genome_index, + } + } + + if (run_manta) { + call manta.manta_germline { + bam = processed_bam, + bam_index = processed_bam_index, + reference_fasta = reference_genome, + reference_fasta_index = reference_genome_index, + } + } + + if (run_ngsep) { + call ngsep.germline_variant as ngsep_germline_variant { + bam = processed_bam, + reference_fasta = reference_genome, + } + } + + if (run_strelka) { + call strelka.germline as strelka_germline { + bam = processed_bam, + bam_index = processed_bam_index, + reference_fasta = reference_genome, + reference_fasta_index = reference_genome_index, + } + } + + if (run_haplotype_caller) { + call gatk4.germline_variant_calling_wf { + bam = processed_bam, + bam_index = processed_bam_index, + interval_list, + reference_fasta = reference_genome, + reference_fasta_index = reference_genome_index, + reference_dict = reference_genome_dictionary, + dbSNP_vcf, + dbSNP_vcf_index, + known_indels_sites_vcfs, + known_indels_sites_indices, + resources, + } + } + + output { + File? clair_vcf = clair3.pileup_vcf + File? clair_full_vcf = clair3.full_alignment_vcf + File? clair_merged_vcf = clair3.merged_vcf + File? deepvariant_vcf = deepvariant.vcf_output + File? deepvariant_gvcf = deepvariant.gvcf_output + File? deepvariant_runtime_report = deepvariant.runtime_html + File? deepvariant_vcf_stats = deepvariant.vcf_stats + Directory? manta_output = manta_germline.manta_output + File? manta_log = manta_germline.log_file + Array[File]? ngsep_vcf = ngsep_germline_variant.vcf_output + Directory? strelka_output = strelka_germline.strelka_output + File? strelka_log = strelka_germline.log_file + File? haplotype_caller_vcf = germline_variant_calling_wf.vcf_final + File? haplotype_caller_vcf_index = germline_variant_calling_wf.vcf_final_index + } +} diff --git a/workflows/evaluation/somatic-variant-calling.wdl b/workflows/evaluation/somatic-variant-calling.wdl new file mode 100644 index 000000000..2ca5e4443 --- /dev/null +++ b/workflows/evaluation/somatic-variant-calling.wdl @@ -0,0 +1,149 @@ +version 1.3 + +import "../../data_structures/read_group.wdl" +import "../../tools/clair.wdl" +import "../../tools/deepvariant.wdl" +import "../../tools/manta.wdl" +import "../../tools/mutect2.wdl" +import "../../tools/strelka.wdl" + +workflow somatic_variant_calling { + meta { + description: "Workflow for calling somatic variants using multiple tools" + outputs: { + # octopus_vcf: "VCF file with called germline variants from Octopus", + clairs_vcf: "VCF file with called somatic variants from Clair", + deepsomatic_vcf: "VCF file with called somatic variants from DeepSomatic", + manta_output: "Directory containing Manta variant calls", + strelka_output: "Directory containing Strelka somatic variant calls", + mutect2_vcf: "VCF file with filtered somatic variants from Mutect2", + mutect2_vcf_index: "Index file for the Mutect2 filtered somatic variants VCF", + } + allowNestedInputs: true + } + + parameter_meta { + reference_fasta: "Reference genome in FASTA format" + reference_fasta_index: "Index file for the reference genome FASTA" + reference_fasta_dict: "Dictionary file for the reference genome FASTA" + normal_bam: "Input BAM file with aligned reads for normal sample" + normal_bam_index: "Index file for the normal BAM file" + tumor_bam: "Input BAM file with aligned reads for tumor sample" + tumor_bam_index: "Index file for the tumor BAM file" + variant_vcf: "VCF file with variants and allele frequencies to summarize pileups over." + variant_vcf_index: "Index file for the variant VCF" + intervals: "One or more genomic intervals over which to operate. Often the same file as `variants`" + intervals_index: "Index file for the intervals file" + # octopus_forest: "Pre-trained random forest model for Octopus variant filtering" + deepsomatic_model_type: "Type of model to use for DeepSomatic variant calling" + # strelka_indel_candidates: "Optional VCF file with candidate indels for Strelka, recommended to be generated by Manta" + exome: "Whether the data is from exome sequencing, which will adjust filtering in some tools accordingly" + } + + input { + File reference_fasta + File reference_fasta_index + File reference_fasta_dict + File normal_bam + File normal_bam_index + File tumor_bam + File tumor_bam_index + File variant_vcf + File variant_vcf_index + File intervals + File intervals_index + ModelType deepsomatic_model_type = ModelType.WGS + Boolean exome = false + } + + call read_group.get_read_groups as normal_read_groups { + bam = normal_bam, + } + + call read_group.get_read_groups as tumor_read_groups { + bam = tumor_bam, + } + + call deepvariant.deepsomatic { + reference_fasta, + reference_fasta_index, + tumor_bam, + tumor_bam_index, + normal_bam, + normal_bam_index, + output_prefix = "deepsomatic_output", + tumor_sample_name = "tumor", + normal_sample_name = "normal", + model_type = deepsomatic_model_type, + runtime_report = true, + vcf_stats_report = true, + } + + call manta.manta_somatic { + reference_fasta, + reference_fasta_index, + normal_bam, + normal_bam_index, + tumor_bam, + tumor_bam_index, + output_dir = "manta_output", + exome, + } + + call strelka.somatic { + reference_fasta, + reference_fasta_index, + normal_bam, + normal_bam_index, + tumor_bam, + tumor_bam_index, + indel_candidates = manta_somatic.indel_candidates, + indel_candidates_index = manta_somatic.indel_candidates_index, + output_dir = "strelka_output", + exome, + } + + call clair.clairs { + reference_fasta, + reference_fasta_index, + normal_bam, + normal_bam_index, + tumor_bam, + tumor_bam_index, + platform = platform.ilmn, + prefix = "~{basename(tumor_bam, ".bam")}_vs_~{basename(normal_bam, ".bam")}", + sample_name = "~{basename(tumor_bam, ".bam")}", + } + + call mutect2.mutect2_wf { + reference_fasta, + reference_fasta_index, + reference_fasta_dict, + normal_bam, + normal_bam_index, + tumor_bam, + tumor_bam_index, + normal_sample_name = select_first([ + normal_read_groups.read_groups[0].SM, + "normal", + ]), + tumor_sample_name = select_first([ + tumor_read_groups.read_groups[0].SM, + "tumor", + ]), + variant_vcf, + variant_vcf_index, + intervals, + intervals_index, + } + + output { + # File octopus_vcf = octopus.somatic.output_vcf + File clairs_vcf = clairs.vcf + File deepsomatic_vcf = deepsomatic.vcf_output + Directory manta_output = manta_somatic.manta_output + Directory strelka_output = somatic.strelka_output + File mutect2_vcf = mutect2_wf.filtered_somatic_vcf + File mutect2_vcf_index = mutect2_wf.filtered_somatic_vcf_index + } +} diff --git a/workflows/reference/inputs/qc-reference-inputs.json b/workflows/reference/inputs/qc-reference-inputs.json index 29cb830cd..1261bf977 100644 --- a/workflows/reference/inputs/qc-reference-inputs.json +++ b/workflows/reference/inputs/qc-reference-inputs.json @@ -19,7 +19,7 @@ "https://ftp.ncbi.nlm.nih.gov/genomes/refseq/plant/Arabidopsis_thaliana/reference/GCF_000001735.4_TAIR10.1/GCF_000001735.4_TAIR10.1_genomic.fna.gz", "https://ftp.ncbi.nlm.nih.gov/genomes/refseq/invertebrate/Drosophila_melanogaster/reference/GCF_000001215.4_Release_6_plus_ISO1_MT/GCF_000001215.4_Release_6_plus_ISO1_MT_genomic.fna.gz", "https://ftp.ncbi.nlm.nih.gov/genomes/refseq/invertebrate/Caenorhabditis_elegans/reference/GCF_000002985.6_WBcel235/GCF_000002985.6_WBcel235_genomic.fna.gz", - "https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_other/Danio_rerio/reference/GCF_000002035.6_GRCz11/GCF_000002035.6_GRCz11_genomic.fna.gz" + "https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_other/Danio_rerio/reference/GCF_049306965.1_GRCz12tu/GCF_049306965.1_GRCz12tu_genomic.fna.gz" ], "qc_reference.kraken_fastas": [], "qc_reference.kraken_build_db.db_name": "custom_kraken2_db",