Skip to content

Commit

Permalink
minor update and bug fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
zhangchaolin committed Aug 4, 2016
1 parent 42c1a6c commit edc3d15
Show file tree
Hide file tree
Showing 8 changed files with 194 additions and 22 deletions.
6 changes: 3 additions & 3 deletions Bed.pm
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ sub readBedFile
my $i = 0;
while (my $bedLine = readNextBedLine ($fin))
{
print "$i ...\n" if $verbose && $i % 100000 == 0;
print STDERR "$i ...\n" if $verbose && $i % 100000 == 0;
$i++;
push @ret, $bedLine;
}
Expand Down Expand Up @@ -489,7 +489,7 @@ sub splitBedFileByChrom

$i++;

print "$i ...\n" if $i % 100000 == 0 && $verbose;
print STDERR "$i ...\n" if $i % 100000 == 0 && $verbose;

$line =~/(\S+)\s/;

Expand Down Expand Up @@ -528,7 +528,7 @@ sub splitBedFileByChrom

if ($sort)
{
print "sorting $chrom ...\n" if $verbose;
print STDERR "sorting $chrom ...\n" if $verbose;
my $f = $tagCount{$chrom}->{'f'};
my $f2 = "$f.sort";

Expand Down
69 changes: 67 additions & 2 deletions Common.pm
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,17 @@ our $VERSION = 1.01;
chop_carriage
clean_rep
cnk
cnkln
count2prob
dbinom
diffArray
entropy
enumerateKofN
getFullPath
getTaxId
gscore
hypergeoTest
hypergeo
intersectArray
list_to_rep
locateArrayElem
Expand Down Expand Up @@ -527,7 +531,7 @@ sub stdev

my $m = mean ($array);

map {$sum+= ($_-$m) ^2} @$array;
map {$sum+= ($_-$m) **2} @$array;

my $n = @$array;
return sqrt ($sum/($n-1));
Expand All @@ -552,7 +556,7 @@ sub norm
my $array = $_[0];
my $sum = 0;

map {$sum+= $_ ^2} @$array;
map {$sum+= $_ **2} @$array;

#my $elem;
#foreach $elem (@$array)
Expand All @@ -563,6 +567,20 @@ sub norm
return $sum;
}

sub count2prob
{
die "count2prob: incorrect number of parameters.\n" if @_ != 1 && @_!= 2;
my ($array, $pseudoCount) = @_;
$pseudoCount = 0 unless $pseudoCount;

my $s = sum ($array);
my $n = @$array;

my @ret = map{($_+$pseudoCount)/($s+$pseudoCount * $n)} @$array;
return \@ret;
}



sub max
{
Expand Down Expand Up @@ -905,6 +923,53 @@ sub dbinom
}


#hypergeometric probability

sub hypergeoTest
{
my ($m, $k, $N, $n) = @_;

my $pval = 0;
for (my $i = $k; $i <= Common::min($n, $m); $i++)
{
my $p = hypergeo ($m, $i, $N, $n);
#print "hypergeo ($m, $i, $N, $n) = $p ...\n";
$pval += $p;
}
return $pval;
}

sub hypergeo
{
my ($m, $k, $N, $n) = @_;
my $logP = cnkln($m, $k) + cnkln ($N-$m, $n-$k) - cnkln ($N, $n);
return exp($logP);
}

{
my @cache;
$cache[0] = 0;
$cache[1] = 0;

sub factorln
{
my $n = shift;
return $cache[$n] if defined $cache[$n];
return undef if $n < 0;
for (my $i = scalar(@cache); $i <= $n; $i++)
{
$cache[$i] = $cache[$i-1] + log($i);
}
return $cache[$n];
}
}

sub cnkln
{
my ($n, $k) = @_;
return factorln($n) - factorln($k) - factorln($n-$k);
}


#/////////////////////////Clustering////////////////////////////////
sub matrix2clusters
Expand Down
18 changes: 14 additions & 4 deletions Motif.pm
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ our $VERSION = 1.01;
getMatrixScore
getMaxMatrixScore
getMinMatrixScore
maskWord
printMotif
readMotifFile
readMatchFile
Expand Down Expand Up @@ -743,7 +744,7 @@ sub maskWord

sub countMismatch
{
my ($w1, $w2, $ignoreCase) = @_;
my ($w1, $w2, $ignoreCase, $maxDiff) = @_;

$w1 =~tr/a-z/A-Z/ if $ignoreCase;
$w2 =~tr/a-z/A-Z/ if $ignoreCase;
Expand All @@ -753,16 +754,25 @@ sub countMismatch
my @w1 = split (//, $w1);
my @w2 = split (//, $w2);

return countMismatchBase (\@w1, \@w2);
return countMismatchBase (\@w1, \@w2, $maxDiff);
}

#we assume $w1 and $w2 has the same length
#
# my $n = countMismatchBase (\@w1, \@w2, $maxDiff);

sub countMismatchBase
{
my ($w1, $w2) = @_;
my ($w1, $w2, $maxDiff) = @_;
my $n = 0;
my $l = @$w1;
map {$n++ if $w1->[$_] ne $w2->[$_]} (0 .. ($l-1));
$maxDiff = $l unless $maxDiff;

for (my $i = 0; $i < $l; $i++)
{
$n++ if $w1->[$i] ne $w2->[$i];
last if $n >= $maxDiff;
}
return $n;
}

Expand Down
5 changes: 5 additions & 0 deletions MyConfig.pm
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ sub getGenomeDir
return "$genomeDir/hg19" if ($species eq 'hg19');
return "$genomeDir/panTro3" if ($species eq 'panTro3');
return "$genomeDir/panTro4" if ($species eq 'panTro4');
return "$genomeDir/papAnu2" if ($species eq 'papAnu2');
return "$genomeDir/calJac3" if ($species eq 'calJac3');
return "$genomeDir/saiBol1" if ($species eq 'saiBol1');
return "$genomeDir/micMur1" if ($species eq 'micMur1');
return "$genomeDir/ponAbe2" if ($species eq 'ponAbe2');
return "$genomeDir/gorGor3" if ($species eq 'gorGor3');
return "$genomeDir/rheMac2" if ($species eq 'rheMac2');
Expand All @@ -95,6 +99,7 @@ sub getGenomeDir
return "$genomeDir/dm2" if ($species eq 'dm2');
return "$genomeDir/ce2" if ($species eq 'ce2');
return "$genomeDir/sacCer1" if ($species eq 'sacCer1');
return "$genomeDir/danRer10" if ($species eq 'danRer10');
Carp::croak "can not find the directory for $species\n";
}

Expand Down
64 changes: 64 additions & 0 deletions Quantas.pm
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ our $VERSION = 1.01;

@EXPORT = qw (
readConfigFile
readASConfigFile
readExprConfigFile
readExprDataFile
readASDataFile
readBed6DataFile
Expand All @@ -48,6 +50,11 @@ use Carp;
use Common;


=head2
readConfigFile - obsolete now
=cut

sub readConfigFile
{
my ($configFile, $base, $type) = @_;
Expand Down Expand Up @@ -78,6 +85,63 @@ sub readConfigFile
return \%groups;
}

sub readASConfigFile
{
my ($configFile, $base, $type) = @_;
my $fin;
open ($fin, "<$configFile") || Carp::croak "cannot open file $configFile to read\n";
my $i = 0;
my %groups;

while (my $line = <$fin>)
{
chomp $line;
next if $line=~/^\s*$/;
next if $line=~/^\#/;
my ($sampleName, $groupName) = split (/\t/, $line);
$groups{$groupName}->{"id"} = $i++ unless exists $groups{$groupName};
push @{$groups{$groupName}->{"samples"}}, $sampleName;

#check whether input file exists
my $inputFile = $base ne '' ? "$base/$sampleName" : $sampleName;
if (-d $inputFile)
{
Carp::croak "undefined type?\n" unless $type;
$inputFile = "$inputFile/$type.count.txt";
}
Carp::croak "Input file $inputFile does not exist\n" unless -f $inputFile;
}
close ($fin);
return \%groups;
}

sub readExprConfigFile
{
my ($configFile, $base, $suffix) = @_;
my $fin;
open ($fin, "<$configFile") || Carp::croak "cannot open file $configFile to read\n";
my $i = 0;
my %groups;

while (my $line = <$fin>)
{
chomp $line;
next if $line=~/^\s*$/;
next if $line=~/^\#/;
my ($sampleName, $groupName) = split (/\t/, $line);

$sampleName .= $suffix if $suffix;

$groups{$groupName}->{"id"} = $i++ unless exists $groups{$groupName};
push @{$groups{$groupName}->{"samples"}}, $sampleName;

#check whether input file exists
my $inputFile = $base ne '' ? "$base/$sampleName" : $sampleName;
Carp::croak "Input file $inputFile does not exist\n" unless -f $inputFile;
}
close ($fin);
return \%groups;
}


=head2 readExprDataFile
Expand Down
1 change: 1 addition & 0 deletions Sam.pm
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ our $VERSION = 1.01;

@EXPORT = qw (
lineToSam
decodeSamFlag
readSamFile
samToLine
samToBed
Expand Down
51 changes: 39 additions & 12 deletions Sequence.pm
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,15 @@

package Sequence;

use Bio::SeqIO;

use Common;
use Bed;

require Exporter;


our $VERSION = 1.01;
our $VERSION = 1.02;


@ISA = qw (Exporter);
Expand Down Expand Up @@ -222,25 +224,46 @@ sub writeFastaSeq

=head2 wordcount
count the number of all n-mer in a DNA/RNA sequence (case insensitive, i.e. sequeuence will be converted into upper case)
count the number of all n-mer in one or more DNA/RNA sequences (case insensitive, i.e. sequeuence will be converted into upper case)
my $wordHash = wordcount ($seqStr, $wordSize)
my $wordHash = wordcount ($seqStr, $wordSize, {wordHash=>$oldWordHash)
my $wordHash = wordcount ($inFastaFile, $wordSize, {fasta=>1})
=cut

sub wordcount
{
my ($seqStr, $wordSize) = @_;
my $seqLen = length ($seqStr);
$seqStr = uc ($seqStr);
my %wordHash;
for (my $i = 0; $i < length ($seqStr) - $wordSize + 1; $i++)
my ($seqStr, $wordSize, $opt) = @_;

my $isFasta = exists $opt->{'fasta'} ? $opt->{'fasta'} : 0;
my $wordHash = exists $opt->{'wordHash'} ? $opt->{'wordHash'} : {};
my $noMask = exists $opt->{'noMask'} ? $opt->{'noMask'} : 0;

if ($isFasta)
{
my $w = substr ($seqStr, $i, $wordSize);
next if $w=~/[^ACGT]/;
$wordHash{$w}++;
my $inFile = $seqStr;
my $seqIO = Bio::SeqIO->new (-file =>$inFile, -format => 'Fasta');
while (my $seq = $seqIO->next_seq())
{
$wordHash = wordcount ($seq->seq(), $wordSize, {wordHash=>$wordHash, noMask=>$noMask});
}
}
else
{
my $seqLen = length ($seqStr);
$seqStr = uc ($seqStr);
for (my $i = 0; $i < length ($seqStr) - $wordSize + 1; $i++)
{
my $w = substr ($seqStr, $i, $wordSize);
if ($noMask == 0)
{
next if $w=~/[^ACGTU]/;
}
$wordHash->{$w}++;
}
}
return \%wordHash;
return $wordHash;
}


Expand Down Expand Up @@ -543,7 +566,11 @@ sub bedToSeq
{
my $blockStart = $bed->{'chromStart'} + $bed->{'blockStarts'}->[$i];
my $blockSeqStr = substr ($chromSeq->{"seq"}, $blockStart, $bed->{'blockSizes'}->[$i]);
Carp::croak "incorrect size for transcript $name, block $i\n" if length ($blockSeqStr) != $bed->{'blockSizes'}->[$i];
if (length ($blockSeqStr) != $bed->{'blockSizes'}->[$i])
{
warn "incorrect size for transcript $name, block $i\n";
return "";
}
$seqStr .= $blockSeqStr;
}

Expand Down
2 changes: 1 addition & 1 deletion ViennaRNA.pm
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ sub readRNAduplexFile
{
next if $line=~/^\s*$/;
next if $line=~/^\>/;
$line =~/^(\S*?)\s+(\d+)\,(\d+)\s+\:\s+(\d+)\,(\d+)\s+\(\s*(\S*?)\)$/;
$line =~/^(\S*?)\s+(\d+)\,(\d+)\s+\:\s+(\d+)\,(\d+)\s+\(\s*(\S*?)\).*$/;
my %entry = (struct=>$1,
targetStart=>$2,
targetEnd=>$3,
Expand Down

0 comments on commit edc3d15

Please sign in to comment.