Skip to content

Commit a822bdd

Browse files
authored
Merge pull request #762 from nf-core/fix-database-inputs
Fix parameter input validation for file/directory based parameters
2 parents aed22fb + 7c32cf2 commit a822bdd

File tree

2 files changed

+55
-25
lines changed

2 files changed

+55
-25
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1717
- [#748](https://github.com/nf-core/mag/pull/748) - Fix broken phix reference channel when skipping phix removal (reported by @amizeranschi, fix by @muabnezor)
1818
- [#752](https://github.com/nf-core/mag/pull/752) - Fix QUAST results not being displayed when skipping certain steps (reported by @amizeranschi, fix by @jfy133)
1919
- [#753](https://github.com/nf-core/mag/pull/753) - Fix iGenomes reference support for host removal reference genome (reported by @Thomieh73, fix by @jfy133)
20+
- [#759](https://github.com/nf-core/mag/pull/758) - Fixed parameters that allow both files or directories to not error with directories, and general file input validation improvements (reported by @mjfi2sb3, fix by @jfy133)
2021

2122
### `Deprecated`
2223

nextflow_schema.json

Lines changed: 54 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
"input": {
1616
"type": "string",
1717
"mimetype": "text/csv",
18-
"format": "file-path-pattern",
18+
"format": "file-path",
1919
"exists": true,
2020
"schema": "assets/schema_input.json",
2121
"pattern": "^\\S+\\.csv$",
@@ -32,7 +32,7 @@
3232
"assembly_input": {
3333
"type": "string",
3434
"mimetype": "text/csv",
35-
"format": "file-path-pattern",
35+
"format": "file-path",
3636
"exists": true,
3737
"schema": "assets/schema_assembly_input.json",
3838
"pattern": "^\\S+\\.csv$",
@@ -324,12 +324,16 @@
324324
"host_fasta": {
325325
"type": "string",
326326
"description": "Fasta reference file for host contamination removal.",
327-
"help_text": "This parameter is mutually exclusive with `--host_genome`. The reference can be masked. Host read removal is done with Bowtie2."
327+
"help_text": "This parameter is mutually exclusive with `--host_genome`. The reference can be masked. Host read removal is done with Bowtie2.",
328+
"format": "file-path",
329+
"exists": true
328330
},
329331
"host_fasta_bowtie2index": {
330332
"type": "string",
331333
"description": "Bowtie2 index directory corresponding to `--host_fasta` reference file for host contamination removal.",
332-
"help_text": "This parameter must be used in combination with `--host_fasta`, and should be a directory containing files from the output of `bowtie2-build`, i.e. files ending in `.bt2`"
334+
"help_text": "This parameter must be used in combination with `--host_fasta`, and should be a directory containing files from the output of `bowtie2-build`, i.e. files ending in `.bt2`",
335+
"format": "directory-path",
336+
"exists": true
333337
},
334338
"host_removal_verysensitive": {
335339
"type": "boolean",
@@ -351,7 +355,9 @@
351355
"type": "string",
352356
"default": "${baseDir}/assets/data/GCA_002596845.1_ASM259684v1_genomic.fna.gz",
353357
"description": "Genome reference used to remove Illumina PhiX contaminant reads.",
354-
"hidden": true
358+
"hidden": true,
359+
"format": "file-path",
360+
"exists": true
355361
},
356362
"skip_clipping": {
357363
"type": "boolean",
@@ -419,7 +425,9 @@
419425
"type": "string",
420426
"default": "${baseDir}/assets/data/GCA_000840245.1_ViralProj14204_genomic.fna.gz",
421427
"hidden": true,
422-
"description": "Genome reference used to remove ONT Lambda contaminant reads."
428+
"description": "Genome reference used to remove ONT Lambda contaminant reads.",
429+
"format": "file-path",
430+
"exists": true
423431
},
424432
"save_lambdaremoved_reads": {
425433
"type": "boolean",
@@ -455,21 +463,24 @@
455463
"properties": {
456464
"centrifuge_db": {
457465
"type": "string",
458-
"format": "file-path",
466+
"format": "path",
459467
"exists": true,
460468
"description": "Database for taxonomic binning with centrifuge.",
461469
"help_text": "Local directory containing `*.cf` files, or a URL or local path to a downloaded compressed tar archive of a Centrifuge database. E.g. ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz."
462470
},
463471
"kraken2_db": {
464472
"type": "string",
465-
"format": "file-path",
473+
"format": "path",
474+
"exists": true,
466475
"description": "Database for taxonomic binning with kraken2.",
467476
"help_text": "Path to a local directory, archive file, or a URL to compressed tar archive that contains at least the three files `hash.k2d`, `opts.k2d` and `taxo.k2d`. E.g. ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/minikraken_8GB_202003.tgz."
468477
},
469478
"krona_db": {
470479
"type": "string",
471480
"description": "Database for taxonomic binning with krona",
472-
"help_text": "Path to `taxonomy.tab` file for Krona, instead of downloading the default file. Point at the `.tab` file."
481+
"help_text": "Path to `taxonomy.tab` file for Krona, instead of downloading the default file. Point at the `.tab` file.",
482+
"format": "file-path",
483+
"exists": true
473484
},
474485
"skip_krona": {
475486
"type": "boolean",
@@ -478,7 +489,9 @@
478489
"cat_db": {
479490
"type": "string",
480491
"description": "Database for taxonomic classification of metagenome assembled genomes. Can be either a zipped file or a directory containing the extracted output of such.",
481-
"help_text": "E.g. https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz. This parameter is mutually exclusive with `--cat_db_generate`. The file needs to contain a folder named `*taxonomy*` and `*database*` that hold the respective files."
492+
"help_text": "E.g. https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz. This parameter is mutually exclusive with `--cat_db_generate`. The file needs to contain a folder named `*taxonomy*` and `*database*` that hold the respective files.",
493+
"format": "path",
494+
"exists": true
482495
},
483496
"cat_db_generate": {
484497
"type": "boolean",
@@ -501,31 +514,35 @@
501514
"gtdb_db": {
502515
"type": "string",
503516
"description": "Specify the location of a GTDBTK database. Can be either an uncompressed directory or a `.tar.gz` archive. If not specified will be downloaded for you when GTDBTK or binning QC is not skipped.",
504-
"default": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz"
517+
"default": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz",
518+
"format": "path",
519+
"exists": true
505520
},
506521
"gtdb_mash": {
507522
"type": "string",
508-
"description": "Specify the location of a GTDBTK mash database. If missing, GTDB-Tk will skip the ani_screening step"
523+
"description": "Specify the location of a GTDBTK mash database. If missing, GTDB-Tk will skip the ani_screening step",
524+
"format": "path",
525+
"exists": true
509526
},
510527
"gtdbtk_min_completeness": {
511528
"type": "number",
512-
"default": 50.0,
529+
"default": 50,
513530
"description": "Min. bin completeness (in %) required to apply GTDB-tk classification.",
514531
"help_text": "Completeness assessed with BUSCO analysis (100% - %Missing). Must be greater than 0 (min. 0.01) to avoid GTDB-tk errors. If too low, GTDB-tk classification results can be impaired due to not enough marker genes!",
515532
"minimum": 0.01,
516533
"maximum": 100
517534
},
518535
"gtdbtk_max_contamination": {
519536
"type": "number",
520-
"default": 10.0,
537+
"default": 10,
521538
"description": "Max. bin contamination (in %) allowed to apply GTDB-tk classification.",
522539
"help_text": "Contamination approximated based on BUSCO analysis (%Complete and duplicated). If too high, GTDB-tk classification results can be impaired due to contamination!",
523540
"minimum": 0,
524541
"maximum": 100
525542
},
526543
"gtdbtk_min_perc_aa": {
527544
"type": "number",
528-
"default": 10.0,
545+
"default": 10,
529546
"description": "Min. fraction of AA (in %) in the MSA for bins to be kept.",
530547
"minimum": 0,
531548
"maximum": 100
@@ -547,11 +564,6 @@
547564
"type": "boolean",
548565
"description": "Speed up pplacer step of GTDB-Tk by loading to memory.",
549566
"help_text": "Will be faster than writing to disk (default setting), however at the expense of much larger memory (RAM) requirements for GDTBTK/CLASSIFY."
550-
},
551-
"genomad_db": {
552-
"type": "string",
553-
"description": "Database for virus classification with geNomad",
554-
"help_text": "Must be a directory containing the uncompressed contents from https://zenodo.org/doi/10.5281/zenodo.6994741 (nf-core/mag tested with v1.1)"
555567
}
556568
}
557569
},
@@ -629,7 +641,9 @@
629641
"metaeuk_db": {
630642
"type": "string",
631643
"description": "Path to either a local fasta file of protein sequences, or to a directory containing an mmseqs2-formatted database, for annotation of eukaryotic genomes.",
632-
"help_text": "One option would be the databases from the MetaEuk publication (https://wwwuser.gwdg.de/~compbiol/metaeuk/), however it should be noted that these are focused on marine eukaryotes."
644+
"help_text": "One option would be the databases from the MetaEuk publication (https://wwwuser.gwdg.de/~compbiol/metaeuk/), however it should be noted that these are focused on marine eukaryotes.",
645+
"format": "file-path",
646+
"exists": true
633647
},
634648
"save_mmseqs_db": {
635649
"type": "boolean",
@@ -646,6 +660,13 @@
646660
"type": "boolean",
647661
"description": "Run virus identification."
648662
},
663+
"genomad_db": {
664+
"type": "string",
665+
"description": "Database for virus classification with geNomad",
666+
"help_text": "Must be a directory containing the uncompressed contents from https://zenodo.org/doi/10.5281/zenodo.6994741 (nf-core/mag tested with v1.1)",
667+
"format": "path",
668+
"exists": true
669+
},
649670
"genomad_min_score": {
650671
"type": "number",
651672
"default": 0.7,
@@ -757,7 +778,9 @@
757778
"busco_db": {
758779
"type": "string",
759780
"description": "Download URL for BUSCO lineage dataset, or path to a tar.gz archive, or local directory containing already downloaded and unpacked lineage datasets.",
760-
"help_text": "E.g. https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz or '/path/to/buscodb' (files still need to be unpacked manually). Available databases are listed here: https://busco-data.ezlab.org/v5/data/lineages/."
781+
"help_text": "E.g. https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz or '/path/to/buscodb' (files still need to be unpacked manually). Available databases are listed here: https://busco-data.ezlab.org/v5/data/lineages/.",
782+
"format": "path",
783+
"exists": true
761784
},
762785
"busco_auto_lineage_prok": {
763786
"type": "boolean",
@@ -783,7 +806,9 @@
783806
"checkm_db": {
784807
"type": "string",
785808
"description": "Path to local folder containing already downloaded and uncompressed CheckM database.",
786-
"help_text": "The pipeline can also download this for you if not specified, and you can save the resulting directory into your output directory by specifying `--save_checkm_data`. You should move this directory to somewhere else on your machine (and supply back to the pipeline in future runs again with `--checkm_db`."
809+
"help_text": "The pipeline can also download this for you if not specified, and you can save the resulting directory into your output directory by specifying `--save_checkm_data`. You should move this directory to somewhere else on your machine (and supply back to the pipeline in future runs again with `--checkm_db`.",
810+
"format": "directory-path",
811+
"exists": true
787812
},
788813
"save_checkm_data": {
789814
"type": "boolean",
@@ -793,7 +818,9 @@
793818
"checkm2_db": {
794819
"type": "string",
795820
"description": "Path to local folder containing already downloaded and uncompressed CheckM2 database (.dmnd file).",
796-
"help_text": "The pipeline can also download this for you if not specified, and you can save the resulting directory into your output directory by specifying `--save_checkm2_data`. You should move this directory to somewhere else on your machine (and supply back to the pipeline in future runs again with `--checkm2_db`)."
821+
"help_text": "The pipeline can also download this for you if not specified, and you can save the resulting directory into your output directory by specifying `--save_checkm2_data`. You should move this directory to somewhere else on your machine (and supply back to the pipeline in future runs again with `--checkm2_db`).",
822+
"format": "directory-path",
823+
"exists": true
797824
},
798825
"checkm2_db_version": {
799826
"type": "integer",
@@ -828,7 +855,9 @@
828855
},
829856
"gunc_db": {
830857
"type": "string",
831-
"description": "Specify a path to a pre-downloaded GUNC dmnd database file"
858+
"description": "Specify a path to a pre-downloaded GUNC dmnd database file",
859+
"format": "file-path",
860+
"exists": true
832861
},
833862
"gunc_database_type": {
834863
"type": "string",

0 commit comments

Comments
 (0)