-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathselectSites.pl
771 lines (659 loc) · 20.2 KB
/
selectSites.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
#!/usr/bin/perl
my $usage="Usage: $0 [-g] [-n replacementChar] [-x n] [-s siteList] [-f siteListFile] [-r [1,2,3]] [-cd] [-i splicingData] fastaFile\n" .
" -g : replace unwanted sites with - instead of removing them\n" .
" -n : replace unwanted sites with replacementChar\n" .
" -x n: Remove all gap sites after sites are selected. If n=1, all\n".
" gap-only-sites are removed. If n=3, the gap-only-sites are\n".
" removed by multiple of n(=3). With 7 neighboring gap-only sites,\n".
" 6 of them will be removed, and 1 site will be left. n can be any integer.\n".
" -s : specify the site lists, do not put any spaces between elements\n" .
" example siteList : 1-4,8,90-\n" .
" -r '1,2' : first and 2nd position of the codon triplets are selected\n".
" -f : read site list from a file.\n".
" -c : site list is codon number\n" .
" -i : individual splicing data for each sequences are given\n" .
" -e : with -i, sequence not listed in the splicingData is excluded\n" .
" -d : delete (instead of select) the sites specified\n" .
"\n".
" For -f option, you can use spaces, tab, comma, or new-line delimited\n".
" site numbers. You can use the range specifiers (see -s), but be\n".
" CAREFUL not to use spaces around '-'; e.g. 1-4 is ok, but 1 - 4 is not.\n".
" In this file, you can also add comments. For each line, any characters\n".
" after the 1st # is considered as comments and ignored.\n".
" Example of site list file:\n\n" .
"# This file can be given to -f\n".
" 1-4, 8 # COMMENT: first 5 bases\n".
"90-\n".
"# end of file\n";
my $sep = "\t"; # if you use tab in the sequence name, change this to
# other characters such as ","
my $lineWidth = 70; # used to break the long sequences into lines for FASTA out
use Getopt::Std;
getopts('hx:i:ecgn:df:s:r:') || die "$usage\n";
if (defined($opt_h)) {
die "$usage\n";
}
die "$usage\n" if (@ARGV > 1);
@ARGV = ('-') unless @ARGV; # take STDIN when no arg.
my $dnaFile = shift @ARGV;
# initialize the @seqArray, @seqNameArray, and $maxSeqLen
my @dat = ReadInFASTA($dnaFile);
my $numSeq = @dat;
my $maxLen = MaxSeqLen(@dat);
if (defined($opt_c)) {
$maxLen = int (($maxLen + 2) / 3);
}
my $replaceChar = "-"; # for -g, this character is used to replace
if (defined($opt_g) && defined($opt_n)) {
die "ERROR: -g and -n can not be specified simultaneously. ".
"Choose either -g or -n";
} elsif (defined($opt_g)) {
$replaceChar = '-';
} elsif (defined($opt_n)) {
$replaceChar = $opt_n;
}
if (defined($opt_i)) {
my @result = Splice(@dat);
PrintFASTA(@result);
exit (0);
}
if (defined($opt_r)) {
if (defined($opt_s) || defined($opt_f) || defined($opt_c)) {
die "When -r is specified, -c, -f, or -s can't be used\n";
}
@index = RepeatIndex($maxLen, $opt_r);
} elsif (defined($opt_s)) {
if (defined($opt_f)) {
die "-s and -f can't be used at the same time\n";
}
@index = MkSelIndex($maxLen, $opt_s);
} elsif (defined($opt_f)) {
@index = MkIndexFromFile($maxLen, $opt_f);
} else {
@index = 0..($maxLen - 1);
}
if (defined($opt_d)) {
@allSites = 0..($maxLen - 1) ;
@index = InANotInB (\@allSites, \@index);
}
if (defined($opt_c)) {
@index = CodonToBaseIndex(@index);
}
# @dat = AdjustSeqLength(@dat);
@index = sort {$a <=> $b} (@index);
@index = ExtractUnique (@index);
@dat = Sites (\@dat, \@index);
if (defined ($opt_x)) {
@dat = RemoveGapOnlySites(\@dat, $opt_x);
}
PrintFASTA (@dat);
exit(0);
#### functions
sub RepeatIndex {
my ($max, $optionR) = @_;
$optionR =~ s/\s+//g;
if ($optionR !~ /^[123,]+$/) {
print STDERR "$usage";
die "\nERROR: give comma delimited integer (1, 2, or 3) as the " .
" argument of -r.\ne.g. -r '2,3'\n";
}
my @repList = split (",", $optionR);
my @result = ();
for my $i (1..$max) {
if (MemberQ(($i - 1) % 3 + 1, \@repList) == 1) {
push @result, $i-1;
}
}
return(@result);
}
sub MkSelIndex {
my ($max, $siteList) = @_;
$siteList =~ s/^\s+//;
$siteList =~ s/\s+$//;
my @sites = split(/\s*,\s*/, $siteList);
my @result = ();
foreach my $item (@sites) {
if ($item =~ /^(\d+)-(\d+)$/) {
die "ERROR: 1st number is larger than 2nd in $item\n" if ($1 > $2);
$beginPos = $1 - 1;
$endPos = $2 - 1;
} elsif ($item =~ /^-(\d+)$/) {
$beginPos = 0;
$endPos = $1 - 1;
} elsif ($item =~ /^(\d+)-$/) {
$beginPos = $1 - 1;
$endPos = $max-1;
} elsif ($item =~ /^(\d+)$/) {
$beginPos = $1 - 1;
$endPos = $1 - 1;
} else {
die "$siteList given as the list of sites. " .
"Make sure it is comma delimitted, and each element is " .
" one of the forms: 23-26, 29, -10, 40-\n";
}
push (@result, $beginPos..$endPos);
}
return(@result);
}
sub MkIndexFromFile {
my ($max, $file) = @_;
open (IN,"<$file") || die "Can't open the file $file\n";
my @result=();
while(<IN>) {
chomp();
s/#.*$//; # remove comments
s/^\s+//; s/\s+$//;
next if (/^$/);
s/,/\t/g; # convert commans to tab
s/\s+/\t/g;
unless(/^[\d\t-]+$/) {
warn "This line contains non-numeric, skipped:\n$_\n";
next;
}
my @line = split;
push @result, @line;
}
my $siteString = join ",", @result;
@result = MkSelIndex($max, $siteString);
# change unit offset index to 0-offset.
# for $i (0..$#result) {
# if ($max < $i) {
# warn "site $i is too large. should be less than $max\n";
# }
# $result[$i] -= 1;
# }
return @result;
}
# convert codon index (0 offset) to nucleotide index (0 offset)
sub CodonToBaseIndex {
my @list = @_;
my @result=();
for my $i (@list) {
push (@result, $i*3, $i*3+1, $i*3+2);
}
return (@result);
}
# takes an arg; name of a file from which data are read Then read in
# the data and make an array. Each element of this array corresponds
# to a sequence, name tab data.
sub ReadInFASTA {
my $infile = shift;
my @line;
my $i = -1;
my @result = ();
my @seqName = ();
my @seqDat = ();
open (INFILE, "<$infile") || die "Can't open $infile\n";
while (<INFILE>) {
chomp;
if (/^>/) { # name line in fasta format
$i++;
s/^>\s*//; s/^\s+//; s/\s+$//;
$seqName[$i] = $_;
$seqDat[$i] = "";
} else {
s/^\s+//; s/\s+$//;
s/\s+//g; # get rid of any spaces
next if (/^$/); # skip empty line
s/[uU]/T/g; # change U to T
$seqDat[$i] = $seqDat[$i] . uc($_);
}
# checking no occurence of internal separator $sep.
die ("ERROR: \"$sep\" is an internal separator. Line $. of " .
"the input FASTA file contains this charcter. Make sure this " .
"separator character is not used in your data file or modify " .
"variable \$sep in this script to some other character.\n")
if (/$sep/);
}
close(INFILE);
foreach my $i (0..$#seqName) {
$result[$i] = $seqName[$i] . $sep . $seqDat[$i];
}
return (@result);
}
sub GetSeqDat {
my @data = @_;
my @line;
my @result = ();
foreach my $i (@data) {
@line = split (/$sep/, $i);
push @result, $line[1];
}
return (@result)
}
sub GetSeqName {
my @data = @_;
my @line;
my @result = ();
foreach my $i (@data) {
@line = split (/$sep/, $i);
push @result, $line[0];
}
return (@result)
}
sub SelectSites {
my ($arrayRef, $indexRef) = @_;
unless (@_ == 2 && ref($arrayRef) eq 'ARRAY' && ref($indexRef) eq 'ARRAY'){
die "args to SelectSites() should be ARRAY REF, ARRAY REF\n";
}
my $maxIndex = @$arrayRef -1;
my @result = ();
foreach my $posi (@$indexRef) {
if ($maxIndex < $posi) {
push @result, "?";
} else {
push @result, $$arrayRef[$posi];
}
}
return @result;
}
# 1st argument is a ref to an array with each element is a DNA sequence
# a ref to a vector of indices.
sub ReplaceOtherSitesWChar {
my ($arrayRef, $indexRef, $repChar) = @_;
unless (@_ == 3 && ref($arrayRef) eq 'ARRAY' && ref($indexRef) eq 'ARRAY'){
die "args to ReplaceOtherSitesWChar() should be ARRAY REF, ARRAY REF\n";
}
my $maxIndex = @$arrayRef -1;
my @allSites = 0..($maxIndex) ;
my @index = InANotInB (\@allSites, $indexRef); # making the complement set
warn "WARN: some selected sites don't exists\n"
if (Max(@$indexRef) > $maxIndex);
my @result = @$arrayRef;
foreach my $posi (@index) {
$result[$posi] = $repChar;
}
if ($debug) {
print join "", "debug: ", @$arrayRef, "\n";
print join "", "debug: ", @result, "\n\n";
}
return @result;
}
sub Sites {
my ($datRef, $indexRef) = @_;
my @seqDat = GetSeqDat(@$datRef);
my @seqName = GetSeqName(@$datRef);
my @result = ();
# make 2 dimensional matrix
foreach $seqNumber (0..$#seqDat) {
my @tmpArray = split(//, $seqDat[$seqNumber]);
my @thisSeq = (defined($opt_g) || defined($opt_n)) ?
ReplaceOtherSitesWChar(\@tmpArray, $indexRef, $replaceChar) :
SelectSites(\@tmpArray, $indexRef);
my $thisLine = $seqName[$seqNumber] . "\t" . (join("", @thisSeq));
push @result, $thisLine;
}
return (@result);
}
sub PrintFASTA {
my @seqName = GetSeqName(@_);
my @seqDat = GetSeqDat(@_);
for my $i (0..$#seqDat) {
# print ">$seqName[$i]\n$seqDat[$i]\n";
print ">$seqName[$i]\n";
my $seq = $seqDat[$i];
for (my $pos=0 ; $pos < length ($seq) ; $pos += $lineWidth) {
print substr($seq, $pos, $lineWidth), "\n";
}
}
}
sub MaxSeqLen {
my @data = GetSeqDat(@_);
my $maxLen = 0;
foreach $i (@data) {
my $len = CharLen($i);
$maxLen = $len if ($len > $maxLen);
}
return ($maxLen);
}
# take std seq data (name\tseq), and attach "?" for the shorter sequences
sub AdjustSeqLength {
my @data = @_;
my @seqDat = GetSeqDat(@_);
my @seqName = GetSeqName(@_);
my $maxLen = MaxSeqLen(@_);
foreach $i (0 .. $#seqDat) {
my $thisLen = CharLen ($seqDat[$i]);
if ($thisLen == $maxLen) {
; # do nothing
} elsif ($thisLen < $maxLen) {
my $diff = $maxLen - $thisLen;
warn "WARN: $seqName[$i] shorter. " .
"$diff '?' (missing character) were added at the end\n";
for ($j=0; $j < $diff; $j++) {
$data[$i] = $data[$i] . "?";
}
} else {
die "ERROR: the length of sequence $seqName[$i] is $thisLen, " .
"longer than \$maxLen = $maxLen. Weird!!";
}
}
return (@data);
}
sub RemoveGapOnlySites {
my ($seqDatARef, $multipleOf) = @_;
my @seqDat = GetSeqDat(@$seqDatARef);
my @seqName = GetSeqName(@$seqDatARef);
my $maxLen = MaxSeqLen(@$seqDatARef);
my @gapSites = ();
my @notGapSites = ();
my ($posi, $seqNumber);
my @seqMat = ();
# make 2 dimensional matrix
foreach $seqNumber (0..$#seqDat) {
my @tmpArray = split(//, $seqDat[$seqNumber]);
# Check the length
if (@tmpArray != $maxLen) {
die "ERROR: the sequence $seqName[$i] is not same length " .
"as \$maxLen = $maxLen. Weird!!";
}
push @seqMat, [ @tmpArray ];
}
# now identify the all gap sites
for $posi (0 .. ($maxLen-1)) {
my $gap = 1;
for $seqNumber (0 .. $#seqMat){
if ($seqMat[$seqNumber][$posi] !~ /^[-\?]$/) {
$gap = 0;
last;
}
}
if ($gap == 1) { # all sequences have a gap at these sites
push (@gapSites, $posi+1); # now unit-offset
} else { # there are some non-gap character at these sites
push (@notGapSites, $posi+1);
}
}
my @rmSites = (); # removing multiples of $multipleOf
for(my $i = 0; $i < @gapSites - $multipleOf + 1; $i++) {
my $rmFlag = 1;
for(my $j = 1; $j < $multipleOf; $j++) {
if ($gapSites[$i] + $j != $gapSites[$i+$j]) {
$rmFlag = 0; # we don't want to remove this $i
$j=$multipleOf; # get out of inner loop
}
}
if ($rmFlag == 1) {
push @rmSites, @gapSites[$i..($i+$multipleOf-1)];
$i += $multipleOf - 1;
}
}
my @allSites = 1..($maxLen) ;
my @selIndex = InANotInB (\@allSites, \@rmSites);
@selIndex = To0Offset(@selIndex); # convert to 0-ffset
# select sites and make results
my @result = ();
for $seqNumber (0 .. $#seqMat) {
my @thisSeq = SelectSites($seqMat[$seqNumber], \@selIndex);
my $line = $seqName[$seqNumber] . $sep . (join("", @thisSeq));
push (@result, $line);
}
# if (@rmSites > 0) {
# warn ("Following sites consist of all gaps, removed from analysis\n");
# print STDERR join(" ", @rmSites);
# print STDERR "\n";
# }
return (@result);
}
# convert 1-offset index array to 0-offset array
sub To0Offset {
my @result = map {$_ - 1} @_;
return @result;
}
# count the number of characters in a string
sub CharLen {
my $string = shift;
my @charString = split (//, $string);
return scalar(@charString);
}
# this function take two scalars and return the larger value
sub larger {
my ($a, $b) = @_;
return (($a > $b) ? $a : $b);
}
sub InANotInB {
my ($aRef, $bRef) =@_;
my %seen = ();
my @aonly =();
foreach my $item (@$bRef) { $seen{$item} = 1};
foreach my $item (@$aRef) {
push (@aonly, $item) unless $seen{$item};
}
return (@aonly);
}
sub ExtractUnique {
my %seen=();
my @unique = ();
foreach my $item (@_) {
push (@unique, $item) unless $seen{$item}++;
}
return @unique;
}
sub Max {
my $max = shift;
foreach $item (@_) {
if ($item > $max) {
$max = $item;
}
}
return $max;
}
sub MemberQ {
my ($x, $arrRef) = @_;
foreach my $item (@$arrRef) {
if ($x eq $item) {
return 1;
}
}
return 0;
}
sub sortByColumn {
# numerical sort by a column, return an array
# sortbyColumn ($col_num, $order, @record)
# @record is an array with each element representing a space delimited record
# example
# ("473 p1 S 0:06 -bash", "541 p2 SW 0:00 ps-a", ....)
# $col_num -- the column by which the record is sorted by (left-most col is 0)
# $order can be "a" (ascending) or "d" (descending),
# sort column can be hyphnated numbers (e.g. 10-4-150)
local $col_num = shift(@_);
local $order = shift(@_);
local @record = @_ ;
local ($sortCol);
## test if the sort column is hyphnated or plain number
local $sortMethod = "number";
foreach $sortCol (@record) {
if ( (split(/\s+/,$sortCol))[$col_num] =~ /\d+-\d+/ ) {
$sortMethod = "hyphnated";
last ;
}
}
return sort $sortMethod @record;
## two sub-sub-routines
sub number {
# $col_num, $order are the given arguments
# the left-most column is 0
local $first = (split(/\s+/, $a))[$col_num];
local $second = (split(/\s+/, $b))[$col_num];
# argument errors not well trapped here
($first,$second) = ($second, $first) if ($order eq "d");
return ($first <=> $second);
}
#probably I don't need the "sub number"
sub hyphnated {
# $col_num, $order are the given arguments
local ($each_number, $cmp_result, @temp_swap);
## separte the hyphnated numbers and put them in the following arrays
local @first = split(/-/, ((split(/\s+/, $a))[$col_num]));
local @second = split(/-/, ((split(/\s+/, $b))[$col_num]));
## ascending (default) or descending order
if ($order eq "d") {
@temp_swap = @first;
@first = @second;
@second = @temp_swap;
}
## comparison of array elements
for ($each_number = 0; $each_number <=
(($#first < $#second) ? $#first : $#second) ; $each_number++) {
$cmp_result = ($first[$each_number] <=> $second[$each_number]);
last if ($cmp_result);
}
## if the size of two arrays differ
if ( ($cmp_result == 0) && ($#first != $#second) ) {
return (($#first < $#second) ? -1 : 1);
} else {
return $cmp_result;
}
}
}
#### not used
sub Bootstrap {
my @data = @_;
my @seqDat = GetSeqDat(@_);
my @seqName = GetSeqName(@_);
my $maxLen = MaxSeqLen(@data);
my ($tmpOutFile, $tmpSeqFileName);
# getting tmpfilenames
do { $tmpSeqFileName = tmpnam() }
until my $fh = IO::File->new($tmpSeqFileName, O_RDWR|O_CREAT|O_EXCL);
close $fh;
do { $tmpOutFile = tmpnam() }
until $fh = IO::File->new($tmpOutFile, O_RDWR|O_CREAT|O_EXCL);
close $fh;
print "$tmpSeqFileName\n$tmpOutFile\n";
# END { unlink($tmpSeqFileName)
# or die "Couldn't unlink $tmpSeqFileName : $!" }
# END { unlink($tmpOutFile) or die "Couldn't unlink $tmpOutFile : $!" }
# prepare PAUP cmd
if (defined($opt_p)) {
$setting = $opt_p;
} else {
$setting = "set criterion=distance; dset distance=k2p";
}
my $paupCmd = "execute $tmpSeqFileName; $setting; " .
"log start file=$tmpOutFile replace=yes ; showdist; " .
"log stop; quit WarnTSave=no;";
warn "PAUP commands:\n$paupCmd\n";
my @sampledDat = SampleSites($maxLen, @seqDat);
WriteNEXUS ($tmpSeqFileName, \@seqName, \@sampledDat);
open (PAUP, "|paup -n");
print PAUP $paupCmd;
close(PAUP);
if (defined ($opt_s)) {
open (GETDIST, "$EXTRACT_PAIR_DIST_EXE -s $opt_s $tmpOutFile|");
} else {
open (GETDIST, "$EXTRACT_PAIR_DIST_EXE $tmpOutFile|");
}
my @dist =();
while(<GETDIST>) {
my @line = split;
if ($. == 1) {
if ($line[$#line] ne "dist") {
warn "## WARN ## using the last column ($line[$#line]) of " .
"output from $EXTRACT_PAIR_DIST_EXE as the distance\n";
}
}
push @dist, $line[$#line];
}
# if (@dist != @seqName * (@seqName - 1) / 2) {
# warn "## DANGER ## PAUP didn't out put correct number of " .
# "pairwise dists\n"
# }
}
# set criterion=distance;dset distance=TamNei Rates=gamma Shape=2.3333 Pinvar=0.1300
#echo "execute ../$(IN_BASENAME).nx;set criterion=distance;dset distance=TamNei Rates=gamma Shape=2.3333 Pinvar=0.1300; log start file=$@.tmp; showdist; log stop;quit WarnTSave=no;" |paup
# $(EXTRACT_PAIR_DIST) -s seqNames $@.tmp > $@
# note this function take only @seqDat
sub SampleSites {
my $maxLen = shift;
my @seqDat = @_;
my @randSites = RandIntArray($maxLen, $maxLen-1);
for my $seqNumber (0 .. $#seqDat) {
my @line = split (//, $seqDat[$seqNumber]);
@line = SelectSites (\@line, \@randSites);
my $randomized = join ("", @line);
push @result, $randomized;
}
return (@result);
}
# rand integers between 0 and $max (both ends inclusive)
sub RandIntArray {
my ($size, $max) = @_;
my @result = ();
for my $i (0 .. $size - 1) {
push @result, int(rand ($max + 1)); # rand returns [0, $max + 1)
}
return (@result);
}
sub WriteNEXUS {
my ($fileName, $nameArrayRef, $datArrayRef) = @_;
my @nameArray = @$nameArrayRef;
my @datArray = @$datArrayRef;
die "Error in WriteNEXUS\n" if (@nameArray != @datArray);
my $numSeq = @nameArray;
my $seqLen = CharLen($datArray[0]);
my $type = "nucleotide";
if (defined ($opt_a)) {
$type = "aminoacid";
}
open (FP, ">$fileName") || die "Can't open a tmpFile $fileName";
print FP "#NEXUS\nBegin data;\n" .
" Dimensions ntax=$numSeq nchar=$seqLen;\n" .
" Format datatype=$type gap=- missing=? matchchar=.;\n" .
" Matrix\n";
for my $i (0 .. $numSeq - 1) {
print FP "\'$nameArray[$i]\' $datArray[$i]\n";
}
print FP " ;\nEnd;\n";
close(FP);
return (0);
}
### for splicing
sub Splice {
my @dat = @_;
my @seqDat = GetSeqDat(@dat);
my @seqName = GetSeqName(@dat);
open SPLICE, "<$opt_i" || die "Can't open $opt_i\n";
my %splice = ReadSplicingData(\*SPLICE);
close (SPLICE);
# for $k (keys (%splice)) { print "debug: $k => $splice{$k}\n"; };
my @result = ();
for my $i (0..$#seqName) {
my $name = $seqName[$i];
if (exists ($splice{$name})) {
my $len = CharLen($seqDat[$i]);
if (defined($opt_c)) {
$len = int (($len + 2) / 3);
}
my @index = MkSelIndex($len, $splice{$name});
if (defined($opt_d)) { # delete the selected sites
my @allSites = 0..($maxLen - 1) ;
@index = InANotInB (\@allSites, \@index);
}
my @tmpDat = ($dat[$i]);
@tmpDat = Sites(\@tmpDat, \@index);
push @result, @tmpDat[0];
} else {
if (! defined ($opt_e)) {
push @result, $dat[$i];
}
}
}
return(@result);
}
sub ReadSplicingData {
my $fh = shift;
my %splice = ();
while(<$fh>) {
chomp;
s/^\s+//; s/\s+$//;
s/#.*$//;
next if (/^$/);
my @line = split (/\t/);
$splice{$line[0]} = $line[1];
}
foreach my $key (keys(%splice)) {
print STDERR "INFO: $key => $splice{$key}\n";
}
return(%splice);
}