-
Notifications
You must be signed in to change notification settings - Fork 0
/
mergePjson.pl
executable file
·119 lines (97 loc) · 2.63 KB
/
mergePjson.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/perl
use strict;
use warnings;
use Data::Dumper;
use Getopt::Long;
use Hash::Merge 'merge';
use JSON;
use FindBin '$Bin';
use lib $Bin;
use Common;
# Usage text printed (via die) whenever the command line cannot be used.
my $usage = <<USAGE;
Usage: $0 [options] <data files>
Options
-------
-t, --tabular : Output two columns, the first with the aseq_id
and the second with the json
All files must be tab-delimited with the first column
containing the aseq_id and have a gzip extension. Moreover,
all files should be in pseudo json.
USAGE

# True when -t / --tabular was given: prefix each output row with its _id.
my $g_Tabular;

# Parse the options BEFORE counting arguments. GetOptions removes the options
# it recognises from @ARGV, so only afterwards does @ARGV hold just the data
# files; checking first would let "script -t" (no files) slip through.
# GetOptions returns false on an unknown/invalid option, in which case the
# usage text is shown rather than carrying on with a bad command line.
GetOptions("t|tabular", \$g_Tabular) or die $usage;
die $usage if (@ARGV == 0);
# Open input handles, keyed by the position of the file on the command line.
my %sources = ();   # {source number => file handle}
# The record currently buffered for each source that still has input.
my %data = ();      # {source number => {aseq_id => id, data => decoded json}}

my $i = 0;
foreach my $file (@ARGV) {
    my $fh = Common::openFileOrGzFile($file);
    # NOTE(review): the helper may already die on failure; this guard only
    # matters if it returns undef instead - confirm against Common.pm.
    die "Unable to open $file\n" unless (defined $fh);
    $sources{$i} = $fh;
    ++$i;
}

Common::startTicker();
initializeData();

# Emit one merged row per aseq_id, always taking the smallest id currently
# buffered across the sources. The loop condition must test definedness, not
# truth: an aseq_id such as "0" or "" is false in boolean context and would
# otherwise stop the merge early, leaving the remaining records unprocessed.
while (defined(my $aseq_id = nextLowestAseqId())) {
    my $rowData = dataForAseqId($aseq_id);
    print $rowData->{_id}, "\t" if ($g_Tabular);
    print to_json($rowData), "\n";
    Common::tick();
}

# Sanity checks: once the loop is done every source should have been read to
# the end, which empties both hashes. Anything left over is reported.
if (scalar(keys %data) > 0) {
    print STDERR qq(Unmerged data:\n);
    print STDERR Dumper(\%data);
}
if (scalar(keys %sources) > 0) {
    print STDERR qq(Unfinished sources\n);
    print STDERR Dumper(\%sources);
}
# --------------------------------------------------------------------------------------------------------------------
# Returns the smallest aseq_id (by string comparison) among the records that
# are currently buffered in %data, or undef when no records are buffered.
sub nextLowestAseqId {
    my $smallest;
    for my $record (values %data) {
        my $candidate = $record->{aseq_id};
        # Keep the current minimum unless this candidate sorts before it.
        next if (defined($smallest) && !($candidate lt $smallest));
        $smallest = $candidate;
    }
    return $smallest;
}
# Builds the merged row for one aseq_id.
#
# Every source whose buffered record carries $aseqId contributes its decoded
# data to the result, and that source is then advanced to its next record.
#
# Parameters:
#   $aseqId - the aseq_id to collect (normally the value just returned by
#             nextLowestAseqId)
# Returns:
#   hash reference holding the merged data (empty hash if nothing matched)
sub dataForAseqId {
    my $aseqId = shift;
    my $merged = {};

    # Visit the sources in numeric (command-line) order. The order of a plain
    # "keys %data" varies between runs, and since the merge below is
    # order-sensitive whenever two sources define the same field, iterating in
    # hash order would make the output non-reproducible.
    # The key list is taken up front, so readNextRecord may safely replace or
    # delete $data{$num} while we loop.
    foreach my $num (sort { $a <=> $b } keys %data) {
        my $ref = $data{$num};
        next if ($ref->{aseq_id} ne $aseqId);

        # When both sides carry a status field, combine them explicitly first
        # so the combined value is already in $merged when merge() runs.
        # NOTE(review): this relies on merge() keeping the left-hand value on
        # conflict (Hash::Merge's documented default) - confirm nothing
        # changes that behaviour.
        if ($merged->{_s} && $ref->{data}->{_s}) {
            $merged->{_s} = Common::mergeStatuses($merged->{_s}, $ref->{data}->{_s});
        }
        $merged = merge($merged, $ref->{data});

        # This source's record is consumed; buffer its next one (or retire
        # the source at end of input).
        readNextRecord($num);
    }
    return $merged;
}
# Advances one source by a single record.
#
# Reads the next line from the handle stored in $sources{$num}. A line is
# expected to be "<aseq_id><TAB><json>"; it is split on the first tab and the
# decoded result is stored in $data{$num}. At end of input the handle is
# closed and the source is removed from both %sources and %data.
#
# Parameters:
#   $num - key of the source in %sources
# Dies:
#   when a line contains no tab, or when from_json rejects the json column
sub readNextRecord {
    my $num = shift;
    my $fh = $sources{$num};
    my $line = $fh->getline();

    # End of input is signalled by undef. Testing plain truth would also treat
    # a final line consisting of "0" (without a newline) as end of input.
    if (!defined($line)) {
        $fh->close();
        delete $sources{$num};
        delete $data{$num};
        return;
    }

    chomp($line);
    # Limit of 2: tabs inside the json column stay part of the json.
    my ($aseqId, $json) = split(/\t/, $line, 2);
    # Without this check a tab-less (e.g. blank) line would hand undef to
    # from_json and fail with a message that does not name the source.
    die "Malformed record in source $num: expected <aseq_id><TAB><json>, got '$line'\n"
        unless (defined $json);

    $data{$num} = {
        aseq_id => $aseqId,
        data => from_json($json)
    };
    return;
}
# Primes the merge: pulls the first record from every open source so that
# %data holds one buffered record per source that has any input.
sub initializeData {
    # keys() yields a complete list before the loop starts, so it does not
    # matter that readNextRecord may delete entries from %sources.
    for my $sourceNum (keys %sources) {
        readNextRecord($sourceNum);
    }
}