Skip to content

Commit

Permalink
Merge pull request #6393 from RZ9082/seqtk
Browse files Browse the repository at this point in the history
Ensure correct output formats of seqtk_seq and seqtk_mergefa in all cases
  • Loading branch information
bgruening authored Oct 16, 2024
2 parents fb87911 + a510db3 commit 75d5141
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 25 deletions.
50 changes: 32 additions & 18 deletions tools/seqtk/seqtk_mergefa.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?xml version="1.0"?>
<tool id="seqtk_mergefa" name="seqtk_mergefa" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="22.05">
<description>merge two FASTA/Q files</description>
<tool id="seqtk_mergefa" name="seqtk_mergefa" version="@TOOL_VERSION@+galaxy1" profile="22.05">
<description>Merge two FASTA/Q files into a FASTA file output</description>
<macros>
<import>macros.xml</import>
</macros>
Expand All @@ -18,17 +18,28 @@ $h
'$in_fa2'
#echo "| pigz -p ${GALAXY_SLOTS:-1} --no-name --no-time" if $in_fa1.is_of_type('fasta.gz', 'fastq.gz') else "" # > '$default'
]]></command>
<configfiles>
<!-- outputs.json is the provided-metadata file named by the <outputs> element.
     It sets the datatype of the "default" output: the merged result is FASTA,
     and it is gzip-compressed exactly when the first input (in_fa1) is a
     gzipped FASTA/FASTQ, mirroring the pigz condition in the command. -->
<configfile filename="outputs.json">
#set $ext = "fasta.gz" if $in_fa1.is_of_type('fasta.gz', 'fastq.gz') else "fasta"
{"default": {"ext": "$ext"}}
</configfile>
</configfiles>
<inputs>
<param name="in_fa1" type="data" format="fasta,fastq,fasta.gz,fastq.gz" label="Input FASTA/Q file #1"/>
<param name="in_fa2" type="data" format="fasta,fastq,fasta.gz,fastq.gz" label="Input FASTA/Q file #2"/>
<param argument="-q" type="integer" value="0" label="Quality threshold"/>
<param argument="-q" type="integer" value="0" label="Quality threshold (for FASTQ)"/>
<param argument="-i" type="boolean" truevalue="-i" falsevalue="" checked="false" label="Take intersection" />
<param argument="-m" type="boolean" truevalue="-m" falsevalue="" checked="false" label="Convert to lowercase when one of the input base is N" />
<param argument="-m" type="boolean" truevalue="-m" falsevalue="" checked="false" label="Pick least ambiguous, mask conflicts and uncertainties" help="Tries to pick the least ambiguous symbol from the two inputs, but masks contradictory bases in the inputs as x in the merged result and converts the merged base to lowercase where one of the input bases is an N." />
<param argument="-r" type="boolean" truevalue="-r" falsevalue="" checked="false" label="Pick a random allele from het" />
<param argument="-h" type="boolean" truevalue="-h" falsevalue="" checked="false" label="Suppress hets in the input" />
</inputs>
<outputs>
<data name="default" format_source="in_fa1" label="${tool.name} on ${on_string}"/>
<outputs provided_metadata_file="outputs.json">
<data name="default" format="auto" label="${tool.name} on ${on_string}" />
</outputs>
<tests>
<test>
Expand All @@ -52,24 +63,27 @@ $h
<help><![CDATA[
**What it does**
Merges two fasta files, using ambiguity codes
This tool merges two FASTA or FASTQ files into a single FASTA file using IUPAC ambiguity codes where appropriate.
When differences occur between the sequences, ambiguity codes are used to represent possible variations.
::
Example::
# seq1.fa
>test0
ACTGACTGAAA
>seq1
ACTGACTGAAA
# seq2.fa
>test0
ACTGAMTGCGN
>seq2
ACTGAMTGCGN
In the following the `-m` option has been set to highlight seqtk-mergefa's features.
will result in::
::
>seq1
ACTGAMTGMRN
>test0
ACTGACTGxxa
If the `-m` option is used, however, the tool picks the less ambiguous of the two input symbols whenever they do not contradict each other. Contradictory bases are masked as x in the merged sequence, and the picked base is converted to lowercase if the less specific symbol is an N, to express uncertainty.
With this logic, the input sequences above produce the following merged sequence::
>seq1
ACTGACTGxxa
@ATTRIBUTION@
]]></help>
Expand Down
37 changes: 30 additions & 7 deletions tools/seqtk/seqtk_seq.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<?xml version="1.0"?>
<tool id="seqtk_seq" name="seqtk_seq" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="22.05">
<tool id="seqtk_seq" name="seqtk_seq" version="@TOOL_VERSION@+galaxy1" profile="22.05">
<description>common transformation of FASTA/Q</description>
<macros>
<import>macros.xml</import>
Expand Down Expand Up @@ -34,13 +34,25 @@ $x2
'$in_file'
@CONDITIONAL_GZIP_OUT@
]]></command>
<configfiles>
<!-- outputs.json is the provided-metadata file named by the <outputs> element.
     It sets the datatype of the "default" output:
       * without the A option the output keeps the input's datatype;
       * with the A option the output is FASTA, reported as fasta.gz when the
         input is a gzipped FASTA/FASTQ.
     NOTE(review): the gzip cases assume @CONDITIONAL_GZIP_OUT@ compresses the
     output whenever the input is gzipped; confirm against macros.xml. -->
<configfile filename="outputs.json">
#if not $A
#set $ext = $in_file.ext
#elif $in_file.is_of_type('fasta.gz', 'fastq.gz')
#set $ext = "fasta.gz"
#else
#set $ext = "fasta"
#end if
{"default": {"ext": "$ext"}}
</configfile>
</configfiles>
<inputs>
<expand macro="in_faq"/>
<param argument="-q" type="integer" value="0" label="Mask bases with quality lower than INT" />
<param argument="-X" type="integer" value="255" label="Mask bases with quality higher than INT" />
<param argument="-n" type="text" value="0" label="Masked bases converted to CHAR; 0 for lowercase" />
<param argument="-n" type="text" value="" label="Masked bases converted to CHAR; leave empty for lowercase masking" />
<param argument="-l" type="integer" value="0" label="Number of residues per line; 0 for 2^32-1" />
<param argument="-Q" type="integer" value="33" label="Quality shift: ASCII-INT gives base quality" />
<param argument="-Q" type="integer" value="33" label="Quality shift: ASCII-INT gives base quality" help="Only applied during comparison to quality thresholds for masking" />
<param argument="-s" type="integer" value="11" label="Random seed" help="Effective with -f" />
<param argument="-f" type="float" value="1" label="Sample fraction of sequences" />
<param argument="-M" type="data" format="bed,txt" optional="true" label="Mask regions in BED or name list file" />
Expand All @@ -53,26 +65,37 @@ $x2
<param name="x1" argument="-1" type="boolean" truevalue="-1" falsevalue="" checked="false" label="Output the 2n-1 reads only" />
<param name="x2" argument="-2" type="boolean" truevalue="-2" falsevalue="" checked="false" label="Output the 2n reads only" />
</inputs>
<outputs>
<data name="default" format_source="in_file" label="${tool.name} on ${on_string}" />
<outputs provided_metadata_file="outputs.json">
<data name="default" format="auto" label="${tool.name} on ${on_string}" />
</outputs>

<tests>
<!-- This is a sorry excuse for a test for a tool which does way more
than it should, but upstream decided to put a TON of functionality
into a single tool rather than using the single responsibility
principle. -->
<test>
<test expect_num_outputs="1">
<param name="in_file" value="seqtk_seq.fa"/>
<param name="r" value="True"/>
<param name="n" value=""/>
<output name="default" file="seqtk_seq_revcom.fa" ftype="fasta"/>
</test>
<test>
<test expect_num_outputs="1">
<param name="in_file" value="seqtk_seq.fa.gz" ftype="fasta.gz"/>
<param name="r" value="True"/>
<param name="n" value=""/>
<output name="default" file="seqtk_seq_revcom.fa.gz" ftype="fasta.gz"/>
</test>
<test expect_num_outputs="1">
<param name="in_file" value="seqtk_trimfq.fq" ftype="fastq"/>
<param name="A" value="True" />
<output name="default" file="seqtk_seq_A.fasta" ftype="fasta"/>
</test>
<test expect_num_outputs="1">
<param name="in_file" value="seqtk_trimfq.fq.gz" ftype="fastq.gz"/>
<param name="A" value="True" />
<output name="default" file="seqtk_seq_A.fasta.gz" ftype="fasta.gz"/>
</test>
</tests>
<help><![CDATA[
**What it does**
Expand Down
2 changes: 2 additions & 0 deletions tools/seqtk/test-data/seqtk_seq_A.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>SEQ_ID1
GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT
Binary file added tools/seqtk/test-data/seqtk_seq_A.fasta.gz
Binary file not shown.

0 comments on commit 75d5141

Please sign in to comment.