Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure correct output formats of seqtk_seq and seqtk_mergefa in all cases #6393

Merged
merged 16 commits into from
Oct 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 32 additions & 18 deletions tools/seqtk/seqtk_mergefa.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?xml version="1.0"?>
<tool id="seqtk_mergefa" name="seqtk_mergefa" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="22.05">
<description>merge two FASTA/Q files</description>
<tool id="seqtk_mergefa" name="seqtk_mergefa" version="@TOOL_VERSION@+galaxy1" profile="22.05">
<description>Merge two FASTA/Q files into a FASTA file output</description>
<macros>
<import>macros.xml</import>
</macros>
Expand All @@ -18,17 +18,28 @@ $h
'$in_fa2'
#echo "| pigz -p ${GALAXY_SLOTS:-1} --no-name --no-time" if $in_fa1.is_of_type('fasta.gz', 'fastq.gz') else "" # > '$default'
]]></command>
<configfiles>
<!-- Cheetah template rendered to outputs.json. It reports the extension of
     the "default" output: "fasta.gz" when in_fa1 is a gzipped FASTA/Q
     dataset, "fasta" otherwise. This mirrors the conditional pigz step in
     the command section. -->
<configfile filename="outputs.json">
#set $ext = "fasta.gz" if $in_fa1.is_of_type('fasta.gz', 'fastq.gz') else "fasta"
{"default": {"ext": "$ext"}}
</configfile>
</configfiles>
<inputs>
<param name="in_fa1" type="data" format="fasta,fastq,fasta.gz,fastq.gz" label="Input FASTA/Q file #1"/>
<param name="in_fa2" type="data" format="fasta,fastq,fasta.gz,fastq.gz" label="Input FASTA/Q file #2"/>
<param argument="-q" type="integer" value="0" label="Quality threshold"/>
<param argument="-q" type="integer" value="0" label="Quality threshold (for FASTQ)"/>
<param argument="-i" type="boolean" truevalue="-i" falsevalue="" checked="false" label="Take intersection" />
<param argument="-m" type="boolean" truevalue="-m" falsevalue="" checked="false" label="Convert to lowercase when one of the input base is N" />
<param argument="-m" type="boolean" truevalue="-m" falsevalue="" checked="false" label="Pick least ambiguous, mask conflicts and uncertainties" help="Tries to pick the least ambiguous symbol from the two inputs, but masks contradictory bases in the inputs as x in the merged result and converts the merged base to lowercase where one of the input bases is an N." />
<param argument="-r" type="boolean" truevalue="-r" falsevalue="" checked="false" label="Pick a random allele from het" />
<param argument="-h" type="boolean" truevalue="-h" falsevalue="" checked="false" label="Suppress hets in the input" />
</inputs>
<outputs>
<data name="default" format_source="in_fa1" label="${tool.name} on ${on_string}"/>
<outputs provided_metadata_file="outputs.json">
<data name="default" format="auto" label="${tool.name} on ${on_string}" />
</outputs>
<tests>
<test>
Expand All @@ -52,24 +63,27 @@ $h
<help><![CDATA[
**What it does**

Merges two fasta files, using ambiguity codes
This tool merges two FASTA or FASTQ files into a single FASTA file using IUPAC ambiguity codes where appropriate.
When differences occur between the sequences, ambiguity codes are used to represent possible variations.

::
Example::

# seq1.fa
>test0
ACTGACTGAAA
>seq1
ACTGACTGAAA

# seq2.fa
>test0
ACTGAMTGCGN
>seq2
ACTGAMTGCGN

In the following the `-m` option has been set to highlight seqtk-mergefa's features.
will result in::

::
>seq1
ACTGAMTGMRN

>test0
ACTGACTGxxa
If the `-m` option is set, however, the tool picks the less ambiguous of the two input symbols whenever they do not contradict each other. Contradicting symbols are reported as x in the merged sequence, and the picked base is written in lowercase when the other symbol is an N, to express uncertainty.
With this logic, the input sequences above produce the following merged result::

>seq1
ACTGACTGxxa

@ATTRIBUTION@
]]></help>
Expand Down
37 changes: 30 additions & 7 deletions tools/seqtk/seqtk_seq.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<?xml version="1.0"?>
<tool id="seqtk_seq" name="seqtk_seq" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="22.05">
<tool id="seqtk_seq" name="seqtk_seq" version="@TOOL_VERSION@+galaxy1" profile="22.05">
<description>common transformation of FASTA/Q</description>
<macros>
<import>macros.xml</import>
Expand Down Expand Up @@ -34,13 +34,25 @@ $x2
'$in_file'
@CONDITIONAL_GZIP_OUT@
]]></command>
<configfiles>
<!-- Cheetah template rendered to outputs.json. It reports the extension of
     the "default" output: the input's own extension by default, overridden
     when $A is set with "fasta.gz" for gzipped FASTA/Q input and "fasta"
     for uncompressed input. -->
<configfile filename="outputs.json">
#set $ext = $in_file.ext
#if $A
#set $ext = "fasta.gz" if $in_file.is_of_type('fasta.gz', 'fastq.gz') else "fasta"
#end if
{"default": {"ext": "$ext"}}
</configfile>
</configfiles>
<inputs>
<expand macro="in_faq"/>
<param argument="-q" type="integer" value="0" label="Mask bases with quality lower than INT" />
<param argument="-X" type="integer" value="255" label="Mask bases with quality higher than INT" />
<param argument="-n" type="text" value="0" label="Masked bases converted to CHAR; 0 for lowercase" />
<param argument="-n" type="text" value="" label="Masked bases converted to CHAR; leave empty for lowercase masking" />
<param argument="-l" type="integer" value="0" label="Number of residues per line; 0 for 2^32-1" />
<param argument="-Q" type="integer" value="33" label="Quality shift: ASCII-INT gives base quality" />
<param argument="-Q" type="integer" value="33" label="Quality shift: ASCII-INT gives base quality" help="Only applied during comparison to quality thresholds for masking" />
<param argument="-s" type="integer" value="11" label="Random seed" help="Effective with -f" />
<param argument="-f" type="float" value="1" label="Sample fraction of sequences" />
<param argument="-M" type="data" format="bed,txt" optional="true" label="Mask regions in BED or name list file" />
Expand All @@ -53,26 +65,37 @@ $x2
<param name="x1" argument="-1" type="boolean" truevalue="-1" falsevalue="" checked="false" label="Output the 2n-1 reads only" />
<param name="x2" argument="-2" type="boolean" truevalue="-2" falsevalue="" checked="false" label="Output the 2n reads only" />
</inputs>
<outputs>
<data name="default" format_source="in_file" label="${tool.name} on ${on_string}" />
<outputs provided_metadata_file="outputs.json">
<data name="default" format="auto" label="${tool.name} on ${on_string}" />
</outputs>

<tests>
<!-- This is a sorry excuse for a test for a tool which does way more
than it should, but upstream decided to put a TON of functionality
into a single tool rather than using the single responsibility
principle. -->
<test>
<test expect_num_outputs="1">
<param name="in_file" value="seqtk_seq.fa"/>
<param name="r" value="True"/>
<param name="n" value=""/>
<output name="default" file="seqtk_seq_revcom.fa" ftype="fasta"/>
</test>
<test>
<test expect_num_outputs="1">
<param name="in_file" value="seqtk_seq.fa.gz" ftype="fasta.gz"/>
<param name="r" value="True"/>
<param name="n" value=""/>
<output name="default" file="seqtk_seq_revcom.fa.gz" ftype="fasta.gz"/>
</test>
<test expect_num_outputs="1">
<param name="in_file" value="seqtk_trimfq.fq" ftype="fastq"/>
<param name="A" value="True" />
<output name="default" file="seqtk_seq_A.fasta" ftype="fasta"/>
</test>
<test expect_num_outputs="1">
<param name="in_file" value="seqtk_trimfq.fq.gz" ftype="fastq.gz"/>
<param name="A" value="True" />
<output name="default" file="seqtk_seq_A.fasta.gz" ftype="fasta.gz"/>
</test>
</tests>
<help><![CDATA[
**What it does**
Expand Down
2 changes: 2 additions & 0 deletions tools/seqtk/test-data/seqtk_seq_A.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>SEQ_ID1
GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT
Binary file added tools/seqtk/test-data/seqtk_seq_A.fasta.gz
Binary file not shown.
Loading