File Coverage

blib/lib/Bio/MUST/Core/Taxonomy.pm
Criterion Covered Total %
statement 123 749 16.4
branch 0 144 0.0
condition 0 73 0.0
subroutine 41 93 44.0
pod 22 22 100.0
total 186 1081 17.2


line stmt bran cond sub pod time code
1             package Bio::MUST::Core::Taxonomy;
2             # ABSTRACT: NCBI Taxonomy one-stop shop
3             # CONTRIBUTOR: Loic MEUNIER <loic.meunier@doct.uliege.be>
4             # CONTRIBUTOR: Mick VAN VLIERBERGHE <mvanvlierberghe@doct.uliege.be>
5             $Bio::MUST::Core::Taxonomy::VERSION = '0.212670';
6 17     17   133 use Moose;
  17         44  
  17         133  
7 17     17   120326 use namespace::autoclean;
  17         47  
  17         152  
8              
9 17     17   1820 use autodie;
  17         43  
  17         135  
10 17     17   96134 use feature qw(say);
  17         46  
  17         1574  
11              
12 17     17   137 use Smart::Comments '###';
  17         47  
  17         189  
13              
14 17     17   744802 use MooseX::Storage;
  17         518838  
  17         79  
15             with Storage('io' => 'StorableFile');
16              
17 17     17   5903 use Moose::Util::TypeConstraints;
  17         43  
  17         177  
18              
19 17     17   52616 use Algorithm::NeedlemanWunsch;
  17         59363  
  17         670  
20 17     17   157 use Carp;
  17         43  
  17         1073  
21 17     17   124 use Const::Fast;
  17         44  
  17         173  
22 17     17   10287 use File::Find::Rule;
  17         144242  
  17         189  
23 17     17   1497 use IPC::System::Simple qw(system);
  17         48  
  17         1171  
24 17     17   122 use List::AllUtils 0.12 qw(first firstidx uniq each_array mesh count_by max_by);
  17         458  
  17         1288  
25 17     17   8285 use LWP::Simple qw(get getstore);
  17         1109978  
  17         192  
26 17     17   3371 use Path::Class qw(dir file);
  17         44  
  17         1085  
27 17     17   124 use POSIX;
  17         44  
  17         166  
28 17     17   33247 use Scalar::Util qw(looks_like_number);
  17         66  
  17         957  
29 17     17   140 use Try::Tiny;
  17         43  
  17         1014  
30 17     17   8658 use Try::Tiny::Warnings;
  17         12318  
  17         1024  
31 17     17   9883 use XML::Bare;
  17         135481  
  17         1026  
32              
33 17     17   8694 use Bio::LITE::Taxonomy::NCBI::Gi2taxid qw(new_dict);
  17         166013  
  17         1177  
34 17     17   169 use Bio::Phylo::IO qw(parse);
  17         48  
  17         911  
35              
36 17     17   127 use Bio::MUST::Core::Types;
  17         42  
  17         575  
37 17     17   113 use Bio::MUST::Core::Constants qw(:ncbi :files);
  17         40  
  17         3681  
38 17     17   139 use Bio::MUST::Core::Utils qw(change_suffix);
  17         36  
  17         885  
39 17     17   130 use aliased 'Bio::MUST::Core::SeqId';
  17         45  
  17         192  
40 17     17   4232 use aliased 'Bio::MUST::Core::IdList';
  17         45  
  17         79  
41 17     17   3462 use aliased 'Bio::MUST::Core::IdMapper';
  17         54  
  17         67  
42 17     17   3487 use aliased 'Bio::MUST::Core::Tree';
  17         42  
  17         74  
43 17     17   3376 use aliased 'Bio::MUST::Core::Taxonomy::MooseNCBI';
  17         47  
  17         86  
44 17     17   2451 use aliased 'Bio::MUST::Core::Taxonomy::Filter';
  17         51  
  17         117  
45 17     17   2283 use aliased 'Bio::MUST::Core::Taxonomy::Criterion';
  17         43  
  17         122  
46 17     17   2260 use aliased 'Bio::MUST::Core::Taxonomy::Category';
  17         45  
  17         114  
47 17     17   2123 use aliased 'Bio::MUST::Core::Taxonomy::Classifier';
  17         50  
  17         111  
48 17     17   2090 use aliased 'Bio::MUST::Core::Taxonomy::Labeler';
  17         49  
  17         84  
49 17     17   2139 use aliased 'Bio::MUST::Core::Taxonomy::ColorScheme';
  17         42  
  17         118  
50              
51              
52             # public path to NCBI Taxonomy dump directory
53             has 'tax_dir' => (
54             traits => ['DoNotSerialize'],
55             is => 'ro',
56             isa => 'Bio::MUST::Core::Types::Dir',
57             required => 1,
58             coerce => 1,
59             );
60              
61              
62             # Note: init_arg => undef had to be removed to allow proper serialization
63             # this is needed because MooseX::Storage has never accepted my patches
64             # see https://rt.cpan.org/Public/Bug/Display.html?id=65733
65              
66             has '_ncbi_tax' => (
67             is => 'ro',
68             isa => 'Bio::MUST::Core::Taxonomy::MooseNCBI',
69             lazy => 1,
70             builder => '_build_ncbi_tax',
71             handles => qr{get_\w+}xms, # expose Bio::LITE::Taxonomy accessor methods
72             );
73              
74             # Note: this is related to (yet different from)
75             has '_gi_mapper' => ( # the nearly homonymous 'gi_mapper' method
76             traits => ['DoNotSerialize'],
77             is => 'ro',
78             isa => 'Bio::LITE::Taxonomy::NCBI::Gi2taxid',
79             lazy => 1,
80             builder => '_build_gi_mapper',
81             handles => {
82             get_taxid_from_gi => 'get_taxid',
83             },
84             );
85              
86              
87             has '_is_deleted' => (
88             traits => ['Hash'],
89             is => 'ro',
90             isa => 'HashRef[Bool]',
91             lazy => 1,
92             builder => '_build_is_deleted',
93             handles => {
94             'is_deleted' => 'defined',
95             },
96             );
97              
98              
99             has '_' . $_ . '_for' => (
100             traits => ['Hash'],
101             is => 'ro',
102             isa => 'HashRef[Str]',
103             lazy => 1,
104             builder => '_build_' . $_ . '_for',
105             handles => {
106             'is_' . $_ => 'defined',
107             $_ . '_for' => 'get',
108             },
109             ) for qw(merged misleading);
110              
111              
112             has '_dupes_for' => (
113             traits => ['Hash'],
114             is => 'ro',
115             isa => 'HashRef[HashRef[Str]]',
116             lazy => 1,
117             builder => '_build_dupes_for',
118             handles => {
119             'is_dupe' => 'defined',
120             'dupes_for' => 'get',
121             },
122             );
123              
124             # TODO: change this name? to avoid confusion with othe uses of rank_for
125             has '_rank_for' => (
126             traits => ['Hash'],
127             is => 'ro',
128             isa => 'HashRef[Str]',
129             lazy => 1,
130             builder => '_build_rank_for',
131             handles => {
132             'rank_for' => 'get',
133             },
134             );
135              
136              
137             has '_strain_taxid_for' => (
138             traits => ['Hash'],
139             is => 'ro',
140             isa => 'HashRef[HashRef[Str]]',
141             lazy => 1,
142             builder => '_build_strain_taxid_for',
143             handles => {
144             'get_strain_taxid_for' => 'get', # TODO: rename?
145             },
146             );
147              
148              
149             has '_matcher' => (
150             traits => ['DoNotSerialize'],
151             is => 'ro',
152             isa => 'Algorithm::NeedlemanWunsch',
153             init_arg => undef,
154             lazy => 1,
155             builder => '_build_matcher',
156             handles => {
157             nw_score => 'align',
158             },
159             );
160              
161             ## no critic (ProhibitUnusedPrivateSubroutines)
162              
163             sub _build_ncbi_tax {
164 0     0     my $self = shift;
165              
166             #### in _build_ncbi_tax
167              
168 0           local $_ = q{}; # to avoid loosing $_ due to Bio::LITE... constructor
169              
170             # open pipes from (possibly) multiple names.dmp and nodes.dmp
171 0           my $names_txid = file($self->tax_dir, 'names*.dmp' );
172 0           my $names_gca = file($self->tax_dir, 'gca*names.dmp');
173 0           my $nodes_txid = file($self->tax_dir, 'nodes*.dmp' );
174 0           my $nodes_gca = file($self->tax_dir, 'gca*nodes.dmp');
175              
176             # Note: taxid files are loaded last to get precedence in get_taxid_from_name
177 0           my @names_files = ($names_gca, $names_txid);
178 0           my @nodes_files = ($nodes_txid, $nodes_gca);
179              
180             # Note: names.dmp files are reordered on-the-fly so as scientific names
181             # get precedence over other names (e.g., synonyms) in case of duplicates
182             # Note: this should not be useful anymore (due to _dupes_for attribute)
183 0           my $ol = q{'if (index($_, "scientific name") == -1) { print } else { push @sns, $_ } END{ print for @sns }'};
184 0           open my $names_fh, '-|', qq{perl -nle $ol @names_files 2> /dev/null};
185 0           open my $nodes_fh, '-|', qq{cat @nodes_files 2> /dev/null};
186              
187 0           return MooseNCBI->new(
188             names => $names_fh,
189             nodes => $nodes_fh,
190              
191             # allow most NCBI Taxonomy synonyms classes
192             # cut -f4 -d'|' names.dmp | sort | uniq -c
193             # last updated on Aug-18-2021
194             # 1582 acronym
195             # 576856 authority
196             # 228 blast name
197             # 14555 common name
198             # 52040 equivalent name
199             # 484 genbank acronym
200             # 29979 genbank common name
201             # 1096 genbank synonym
202             # 733 in-part
203             # 61755 includes
204             # 2354398 scientific name
205             # 199012 synonym
206             # 167825 type material
207              
208             synonyms => [
209             'synonym', 'genbank synonym',
210             'acronym', 'genbank acronym',
211             'anamorph', 'genbank anamorph', 'teleomorph', # now useless
212             'blast name', 'common name', 'genbank common name',
213             'equivalent name', 'includes',
214             # 'authority', 'in-part', 'type material',
215             ]
216             );
217             }
218              
219             sub _build_gi_mapper {
220 0     0     my $self = shift;
221              
222             #### in _build_gi_mapper
223              
224 0           local $_ = q{}; # to avoid loosing $_ due to Bio::LITE... constructor
225              
226 0           return Bio::LITE::Taxonomy::NCBI::Gi2taxid->new(
227             dict => file($self->tax_dir, 'gi_taxid_nucl_prot.bin'),
228             save_mem => 1, # 0 does not work on my MacBook Air
229             );
230             }
231              
232             sub _build_is_deleted {
233 0     0     my $self = shift;
234              
235             #### in _build_is_deleted
236              
237 0           my %is_deleted;
238              
239 0           my $infile = file($self->tax_dir, 'delnodes.dmp');
240 0           open my $in, '<', $infile;
241              
242 0           while (my $line = <$in>) {
243 0           my ($taxon_id) = $line =~ m/^(\d+)/xms;
244 0           $is_deleted{$taxon_id} = 1;
245             }
246              
247 0           return \%is_deleted;
248             }
249              
250             sub _build_merged_for {
251 0     0     my $self = shift;
252              
253             #### in _build_merged_for
254              
255 0           my %merged_for;
256              
257 0           my $infile = file($self->tax_dir, 'merged.dmp');
258 0           open my $in, '<', $infile;
259              
260 0           while (my $line = <$in>) {
261 0           chomp $line;
262 0           my ($old_taxid, $new_taxid) = split /\s*\|\s*/xms, $line;
263 0           $merged_for{$old_taxid} = $new_taxid;
264             }
265              
266 0           return \%merged_for;
267             }
268              
269             sub _build_misleading_for {
270 0     0     my $self = shift;
271              
272             #### in _build_misleading_for
273              
274 0           my %misleading_for;
275              
276             # TODO: improve warning suppression
277             # cat: .../taxdump/misleading*.dmp: No such file or directory
278 0           my $misleadings = file($self->tax_dir, 'misleading*.dmp');
279 0           open my $in, '-|', "cat $misleadings 2> /dev/null";
280              
281 0           while (my $line = <$in>) {
282 0           chomp $line;
283 0           my ($taxid, $full_org) = split /\s*\|\s*/xms, $line;
284 0           $misleading_for{$full_org} = $taxid;
285             }
286              
287 0           return \%misleading_for;
288             }
289              
290             sub _build_dupes_for {
291 0     0     my $self = shift;
292              
293             #### in _build_dupes_for
294              
295 0           my %taxids_for;
296              
297 0           my $infile = file($self->tax_dir, 'names.dmp');
298 0           open my $in, '<', $infile;
299              
300             # build hash of NCBI names => taxon_id(s) for all taxa
301              
302             LINE:
303 0           while (my $line = <$in>) {
304              
305             # focus on scientific names
306 0 0         next LINE unless $line =~ m/scientific \s name/xms;
307              
308             # Note: old code was tolerant to "minor" duplications:
309             # do not consider as duplicates genus and subgenus that are the same
310             # similarly ignore duplicates involving phylum vs other levels
311             # phyla come after classes and synonyms (and thus should win)
312             # next LINE if $line =~ m/genus>/xms || $line =~ m/<phylum>/xms;
313             # next LINE if $line =~ m/<actinobacteria>/xms; # workaround...
314              
315             # extract taxon and track taxon_id
316 0           chomp $line;
317 0           my ($taxon_id, $taxon) = (split /\s*\|\s*/xms, $line)[0..1];
318 0           push @{ $taxids_for{$taxon} }, $taxon_id;
  0            
319             }
320              
321 0           my %dupes_for;
322              
323             # create helper Taxonomy object
324 0           my $tax = $self->new( tax_dir => $self->tax_dir );
325              
326             TAXON:
327 0           while (my ($taxon, $taxids) = each %taxids_for) {
328              
329             # only proceed for duplicate taxa
330 0 0         next TAXON if @{$taxids} < 2;
  0            
331              
332             # build hash of partial lineage => taxon_id only for duplicate taxa
333 17     17   22998 no warnings 'uninitialized'; # avoid undef due to 2-4-level lineages
  17         51  
  17         2170  
334             my %taxid_for = map {
335 0           ( join q{; }, ( $tax->get_taxonomy($_) )[-5..-1] ) => $_
336 0           } @{$taxids};
  0            
337 17     17   155 use warnings;
  17         45  
  17         42014  
338              
339             # warn of taxa that end by the same three last taxa
340             # Note: this looks like a NCBI bug a couple of scientific names
341             carp "[BMC] Note: $taxon cannot be disambiguated."
342 0 0         . ' This should not be an issue.' if keys %taxid_for < @{$taxids};
  0            
343              
344 0           $dupes_for{$taxon} = \%taxid_for;
345             }
346              
347 0           return \%dupes_for;
348             }
349              
350             # Note: taken from nodes.dmp as follows (then manually re-ordered)
351             # cut -f3 -d'|' nodes.dmp | sort | uniq
352             # last updated on Aug-18-2021
353             const my @LEVELS => (
354             'skip', # default collapsing level
355             'top', # useful to preserve 'cellular organisms'
356             'superkingdom', 'kingdom', 'subkingdom',
357             'superphylum', 'phylum', 'subphylum',
358             'superclass', 'class', 'subclass', 'infraclass',
359             'cohort', 'subcohort',
360             'superorder', 'order', 'suborder', 'infraorder', 'parvorder',
361             'superfamily', 'family', 'subfamily',
362             'tribe', 'subtribe',
363             'genus', 'subgenus',
364             'section', 'subsection',
365             'series',
366             'species group', 'species subgroup', 'species', 'subspecies',
367              
368             'subvariety', 'varietas', # relative order of all these terms is unclear
369             'morph', 'forma', 'forma specialis',
370             'isolate', 'strain',
371             'pathogroup', 'serogroup', 'serotype',
372             'biotype', 'genotype',
373              
374             'clade', 'no rank', # both terms can appear anywhere in a lineage
375             );
376              
377             sub _build_rank_for {
378             #### in _build_rank_for
379 0     0     my $i = 1;
380 0           return { map { $_ => $i-- } @LEVELS };
  0            
381             }
382              
383             sub _build_strain_taxid_for {
384 0     0     my $self = shift;
385              
386             #### in _build_strain_taxid_for
387              
388             # open pipe from (possibly) multiple names.dmp
389             # TODO: try to avoid code duplication with _build_ncbi_tax
390 0           my $names_txid = file($self->tax_dir, 'names*.dmp' );
391 0           my $names_gca = file($self->tax_dir, 'gca*names.dmp');
392              
393             # Note: taxid files are loaded last to get precedence over gca numbers
394 0           my @names_files = ($names_gca, $names_txid);
395 0           open my $names_fh, '-|', "cat @names_files";
396              
397 0           my %strain_taxid_for;
398              
399             LINE:
400 0           while (my $line = <$names_fh>) {
401 0           chomp $line;
402              
403             # fetch taxid and organism name
404 0           my ($taxid, $org) = split /\s*\|\s*/xms, $line;
405              
406             # extract components from organism name
407 0           my ($genus, $species, $strain) = SeqId->parse_ncbi_name($org);
408              
409             # skip entries without strain
410 0 0         next LINE unless $strain;
411              
412             # store tax_id for org and strain combo
413 0           my $org_key = _make_hashkey($genus, $species);
414 0           my $strain_key = _clean_strain($strain);
415 0 0 0       $strain_taxid_for{$org_key}{$strain_key} = $taxid # avoid bug with
416             if defined $strain_key and length $strain_key; # NW aligner
417             }
418              
419 0           return \%strain_taxid_for;
420             }
421              
422             sub _clean_strain {
423 0     0     my $strain = shift;
424              
425             # remove unwanted prefices and characters (if any)
426 0           $strain =~ s{\b substr \b}{}xmsgi;
427 0           $strain =~ s{\b strain \b}{}xmsgi;
428 0           $strain =~ s{\b subsp \b}{}xmsgi;
429 0           $strain =~ s{\b str \b}{}xmsgi;
430 0           $strain =~ tr/A-Za-z0-9//cd; # delete non-alphanumeric chars
431              
432 0           return lc $strain;
433             }
434              
435             sub _make_hashkey {
436 0     0     my $genus = shift;
437 0           my $species = shift;
438              
439 0           $species =~ tr/-//d; # clean hyphens in species
440 0           my $org = lc( $genus . q{ } . $species );
441              
442 0           return $org;
443             }
444              
445              
446             const my $MATCHYOU => 3; # cannot use $MATCH as it is reserved
447             const my $MISMATCH => -1;
448             const my $GAP_OPEN => -2;
449             const my $GAP_EXTEND => 0;
450              
451             sub _build_matcher {
452 0     0     my $self = shift;
453              
454             #### in _build_matcher
455              
456 0           local $_ = q{}; # to avoid loosing $_ due to Algorithm::... constructor
457              
458             # setup Algorithm::NeedlemanWunsch object
459 0           my $matcher = Algorithm::NeedlemanWunsch->new( \&_matcher_matrix );
460 0           $matcher->gap_open_penalty( $GAP_OPEN );
461 0           $matcher->gap_extend_penalty($GAP_EXTEND);
462              
463 0           return $matcher;
464             }
465              
466             const my $GAP_CODE => '*';
467             const my $MATCH_CODE => '.';
468             const my $MISMATCH_CODE => ':';
469              
470             const my %SCORE_FOR => (
471             $GAP_CODE => $GAP_OPEN,
472             $MATCH_CODE => $MATCHYOU,
473             $MISMATCH_CODE => $MISMATCH,
474             );
475              
476             sub _matcher_matrix { ## no critic (RequireArgUnpacking)
477 0     0     return $SCORE_FOR{ _match_code(@_) };
478             }
479              
480             sub _match_code { ## no critic (RequireArgUnpacking)
481 0 0   0     return $GAP_CODE unless @_;
482              
483             # if one is not \w+ and the other is \w+ you want it to be a gap, not a
484             # mismatch --> higher penalty
485 0 0 0       if ( $_[0] =~ m/ [^A-Za-z0-9] /xms && $_[1] =~ m/ [A-Za-z0-9] /xms
      0        
      0        
486             or $_[0] =~ m/ [A-Za-z0-9] /xms && $_[1] =~ m/ [^A-Za-z0-9] /xms ) {
487 0           return $GAP_CODE;
488             }
489              
490 0 0 0       if ( $_[0] =~ m/ [A-Za-z0-9] /xms || $_[1] =~ m/ [A-Za-z0-9] /xms ) {
491 0 0         return ( lc $_[0] eq lc $_[1] ) ? $MATCH_CODE : $MISMATCH_CODE;
492             }
493              
494 0           return $MATCH_CODE;
495             }
496              
497             ## use critic
498              
499              
500             # additional accessor methods (w.r.t. Bio::LITE::Taxonomy)
501             # Since Bio::LITE::Taxonomy methods return arrays (and not ArrayRefs), these
502             # methods also use arrays as in/out data structures (for consistency).
503             # Not quite... We use wantarray when needed now.
504              
505             around qr{ _from_seq_id \z | _from_legacy_seq_id \z }xms => sub {
506             my $method = shift;
507             my $self = shift;
508             my $seq_id = shift;
509              
510             # coerce plain strings to SeqId...
511             # ... but do not touch (stringified) lineages (and SeqIds)
512             match_on_type $seq_id => (
513             'Bio::MUST::Core::Types::Lineage' => sub { },
514             'Str' => sub {
515             $seq_id = SeqId->new( full_id => $seq_id )
516             },
517             => sub { },
518             );
519              
520             return $self->$method($seq_id, @_);
521             };
522              
523             around qr{ _from_name \z }xms => sub {
524             my $method = shift;
525             my $self = shift;
526             my $name = shift;
527              
528             # avoid issue with undefined names
529             return undef unless $name; ## no critic (ProhibitExplicitReturnUndef)
530              
531             if ( $self->is_dupe($name) ) {
532             carp "[BMC] Warning: $name is taxonomically ambiguous;"
533             . ' returning undef!';
534             return undef; ## no critic (ProhibitExplicitReturnUndef)
535             }
536              
537             return $self->$method($name);
538             };
539              
540             around qw( get_taxonomy get_taxonomy_with_levels get_term_at_level ) => sub {
541             my $method = shift;
542             my $self = shift;
543             my $taxon_id = shift;
544              
545             # update taxon_id if merged in current version of NCBI Taxonomy
546             # in contrast, we don't do anything if taxon_id has been deleted
547             if ( defined $taxon_id && $self->is_merged($taxon_id) ) {
548             my $msg = "[BMC] Warning: merged taxid for $taxon_id;";
549             $taxon_id = $self->merged_for($taxon_id);
550             carp "$msg using $taxon_id instead!";
551             }
552              
553             return $self->$method($taxon_id, @_);
554             };
555              
556              
557             sub get_taxid_from_seq_id {
558             my $self = shift;
559             my $seq_id = shift;
560              
561             # 0. return undef in case of tagged contaminant sequences
562             # TODO: return taxonomy of contaminant 'tail' if any?
563             return undef ## no critic (ProhibitExplicitReturnUndef)
564             if $seq_id->is_doubtful;
565              
566             # 1. first handles valid MUST strain names that look like taxon_ids
567             # Note1: 'valid' ids are the opposite to 'foreign_ids'
568             # Note2: such cases are keyed by their full_org names
569             unless ( $seq_id->is_foreign ) {
570             my $taxon_id = $self->misleading_for( $seq_id->full_org );
571             return $taxon_id if $taxon_id;
572             }
573              
574             # 2. default path is to use already parsed taxon_id
575             # this applies to modern MUST SeqIds and taxonomy-aware abbr ids
576             return $seq_id->taxon_id
577             if $seq_id->taxon_id;
578              
579             # 3. handles foreign_ids:
580             # ... if a GI number is available then use the GI-to-taxid mapper
581             # ... otherwise use full_id as a literal NCBI Taxonomy name
582             if ( $seq_id->is_foreign ) {
583             return $self->get_taxid_from_gi( $seq_id->gi )
584             if $seq_id->gi;
585             return $self->get_taxid_from_name( $seq_id->full_id )
586             }
587              
588             # back to valid MUST ids
589             # idea: use as much information as possible but in case of failure...
590             # ... retry after droppping the most specific piece (greedy behavior)
591              
592             # 4. tries to recover MUST legacy strain in NCBI Taxonomy...
593             if ( $seq_id->strain ) {
594             my $taxon_id = $self->get_taxid_from_legacy_seq_id($seq_id);
595             return $taxon_id if $taxon_id;
596             }
597              
598             # 5. tries to recover organism binomial in NCBI Taxonomy...
599             unless ( $seq_id->is_genus_only ) {
600             my $taxon_id = $self->get_taxid_from_name( $seq_id->org );
601             return $taxon_id if $taxon_id;
602             }
603              
604             # 6. tries to recover organism genus in NCBI Taxonomy
605             # this may fail if genus is ambiguous (see around method modifier)
606             return $self->get_taxid_from_name( $seq_id->genus );
607             }
608              
609              
610             sub get_taxid_from_legacy_seq_id {
611             my $self = shift;
612             my $seq_id = shift;
613              
614             # fetch organism name
615             my $org_key = _make_hashkey( $seq_id->genus, $seq_id->species );
616              
617             # fetch and clean organism (query) strain
618             my $query_strain = $seq_id->strain;
619             return undef ## no critic (ProhibitExplicitReturnUndef)
620             unless $query_strain;
621             $query_strain = lc SeqId->clean_strain($query_strain);
622              
623             # fetch hash containing 'strain => taxid' pairs
624             # original clean code using native delegation:
625             # my $taxid_for = $self->get_strain_taxid_for($org_key);
626             # new hackish code for performance only:
627             my $taxid_for = $self->_strain_taxid_for->{$org_key};
628             return undef ## no critic (ProhibitExplicitReturnUndef)
629             unless $taxid_for;
630              
631             # check if strain exist 'as is'
632             my $taxid = $taxid_for->{$query_strain};
633             return $taxid
634             if $taxid;
635              
636             # fetch subject strains from hash in lexical order
637             my @sbjct_strains = sort { $a cmp $b } keys %{$taxid_for};
638              
639             # split query and subject strains
640             my @split_query_strain = split //xms, $query_strain;
641             my @split_sbjct_strains = map { [ split //xms ] } @sbjct_strains;
642              
643             # get score for each alignment through _matcher attribute
644             my @scores = map {
645             $self->nw_score( \@split_query_strain, $_ )
646             } @split_sbjct_strains;
647              
648             # get strain with max_score
649             my $max_score = List::AllUtils::max @scores;
650             my $max_index = firstidx { $_ == $max_score } @scores;
651             my $max_strain = $sbjct_strains[$max_index];
652              
653             # set heuristic threshold score based on shortest string len
654             my $threshold = List::AllUtils::min(
655             length $query_strain, length $max_strain
656             ) * $MATCHYOU + $GAP_OPEN;
657              
658             # return strain taxid if above threshold
659             return undef ## no critic (ProhibitExplicitReturnUndef)
660             if $max_score < $threshold;
661             return $taxid_for->{$max_strain};
662             }
663              
664              
665             sub get_taxonomy_from_seq_id {
666             my $self = shift;
667             my $seq_id = shift;
668              
669             return match_on_type $seq_id => (
670              
671             # regular case: $seq_id is a Bio::MUST::Core::SeqId
672             'Bio::MUST::Core::SeqId' => sub {
673             $self->_taxonomy_from_seq_id_(0, $seq_id);
674             },
675              
676             # optimizations for 'seq_ids' already being lineages (e.g., LCAs)
677             # examine context for returning plain array or ArrayRef
678              
679             # case 1: stringified lineage that must be split on semicolons
680             'Bio::MUST::Core::Types::Lineage' => sub {
681             my @taxonomy = split qr{;\ *}xms, $seq_id; # q{;} or q{; }
682             wantarray ? @taxonomy : \@taxonomy;
683             },
684              
685             # case 2: ArrayRef that just must be dereferenced
686             ArrayRef => sub {
687             wantarray ? @{$seq_id} : $seq_id
688             },
689             );
690             }
691              
692              
693             sub fetch_lineage { ## no critic (RequireArgUnpacking)
694 0     0 1   return shift->get_taxonomy_from_seq_id(@_);
695             }
696              
697              
698             sub get_taxid_from_taxonomy { ## no critic (RequireArgUnpacking)
699 0     0 1   my $self = shift;
700              
701             # fetch taxonomy from args using match_on_type strategy (see above)
702             # Note: this could also work from a seq_id but (this would be inefficient)
703 0           my @taxonomy = $self->get_taxonomy_from_seq_id(@_);
704              
705 0           while (@taxonomy) {
706              
707             # first try to get taxon_id from last taxon
708 0           my $taxon = $taxonomy[-1];
709 0           my $taxon_id = $self->get_taxid_from_name($taxon);
710 0 0         return $taxon_id if $taxon_id;
711              
712             # then try to disambiguate duplicate taxa
713 0           my $dupes_for = $self->dupes_for($taxon);
714 17     17   174 no warnings 'uninitialized'; # avoid undef due to 2-4-level lineages
  17         51  
  17         1256  
715 0           $taxon_id = $dupes_for->{ join q{; }, @taxonomy[-5..-1] };
716 17     17   141 use warnings;
  17         70  
  17         131608  
717 0 0         if ($taxon_id) {
718 0           carp "[BMC] Note: managed to disambiguate $taxon based on lineage!";
719 0           return $taxon_id;
720             }
721              
722             # finally retry with previous (= higher) taxon
723             # Note: this should never be used for NCBI lineages
724             # but for, e.g., SILVA lineages where (triplet) taxa may be different
725 0           pop @taxonomy;
726 0 0         carp "[BMC] Note: trying to identify $taxon by following lineage..."
727             if @taxonomy;
728             }
729              
730 0           return undef; ## no critic (ProhibitExplicitReturnUndef)
731             }
732              
733              
734             sub get_taxonomy_with_levels_from_seq_id { ## no critic (RequireArgUnpacking)
735             return shift->_taxonomy_from_seq_id_(1, @_);
736             }
737              
738              
739             sub _taxonomy_from_seq_id_ {
740 0     0     my $self = shift;
741 0           my $levels = shift;
742 0           my $seq_id = shift;
743              
744             # try to fetch taxon_id and corresponding taxonomy
745 0           my $taxon_id = $self->get_taxid_from_seq_id($seq_id);
746 0 0         my @taxonomy = $levels ? $self->get_taxonomy_with_levels($taxon_id)
747             : $self->get_taxonomy($taxon_id);
748              
749             # workaround for 'return undef' constructs in Bio::LITE::Taxonomy
750             # ... to ensure that no (undef) list is returned by our additional methods
751             # Note: our policy is thus unlike get_taxonomy and get_taxid_from_name
752 0 0         @taxonomy = () unless $taxonomy[0];
753 0 0 0       carp '[BMC] Warning: cannot fetch tax for ' . ( $seq_id->full_id || q{''} )
754             . '; ignoring it!' unless @taxonomy;
755              
756             # examine context for returning plain array or ArrayRef
757 0 0         return wantarray ? @taxonomy : \@taxonomy;
758             }
759              
760              
761             sub get_taxa_from_taxid { ## no critic (RequireArgUnpacking)
762 0     0 1   my $self = shift;
763 0           my $taxon_id = shift;
764              
765             # TODO: consider numbered levels as in fetch-tax.pl?
766 0           my @taxa = map { $self->get_term_at_level($taxon_id, $_) } @_;
  0            
767 0 0         return wantarray ? @taxa : \@taxa; # specify level through currying
768             }
769              
770              
771             sub get_nexus_label_from_seq_id {
772             my $self = shift;
773             my $seq_id = shift;
774             my $args = shift // {}; # HashRef (should not be empty...)
775              
776             # try to fetch organism and accession (or GI number as a fallback)
777             # TODO: allow for choosing between acc/gi when both are available
778             my @lineage = $self->get_taxonomy_from_seq_id($seq_id);
779             my $org = pop @lineage
780             // $seq_id->full_org( q{ } ); # use space before strain
781             my $acc = $args->{append_acc} ? $seq_id->accession // $seq_id->gi : undef;
782              
783             # build label according to available pieces
784             # Note: fallback to full_id for foreign ids (undefined org)
785             my $label = defined $org && defined $acc ? "$org [$acc]"
786             : defined $org ? $org
787             : $seq_id->full_id
788             ;
789              
790             return SeqId->new(full_id => $label)->nexus_id;
791             }
792              
793              
794             # clade analysis methods
795              
796              
797             sub get_common_taxonomy_from_seq_ids { ## no critic (RequireArgUnpacking)
798 0     0 1   my $self = shift;
799 0           my @seq_ids = @_;
800              
801             # setup optional threshold (if any) to first argument
802             # Note: this approach is used to preserved backwards compatibility
803             # Note: even if SeqId->full_id is a number the test works as expected
804 0 0         my $threshold = looks_like_number $seq_ids[0] ? shift @seq_ids : 1.0;
805              
806             # fetch lineages for all seq ids
807             my @lineages = map {
808 0           scalar $self->get_taxonomy_from_seq_id($_) # note the scalar context
  0            
809             } @seq_ids;
810              
811             # compute common lineage
812 0           return $self->_common_taxonomy($threshold, @lineages);
813             }
814              
815              
816             sub compute_lca { ## no critic (RequireArgUnpacking)
817 0     0 1   return shift->get_common_taxonomy_from_seq_ids(@_);
818             }
819              
820             sub _common_taxonomy { ## no critic (RequireArgUnpacking)
821 0     0     my $self = shift;
822 0           my $threshold = shift;
823 0           my @lineages = @_;
824              
825             # ignore missing lineages
826             # TODO: decide whether missing lineages should be taken into account
827 0 0         @lineages = grep { @{$_} ? $_ : () } @lineages;
  0            
  0            
828              
829 0           my @common_lineage;
830              
831             # original version: strict consensus
832             # Note the use of a dynamically-sized multiple ArrayRef iterator
833             # to walk down all lineages in parallel
834             # my $ea = each_arrayref(@lineages);
835             #
836             # TAXON:
837             # while ( my @taxa = $ea->() ) {
838             # no warnings 'uninitialized'; # avoid undef due to longer lineages
839             # last TAXON if uniq(@taxa) > 1;
840             # push @common_lineage, shift @taxa;
841             # }
842              
843             # current version: threshold-based majority-rule consensus
844             # algorithm: at each taxonomic rank count all seen taxa
845             # if the most popular taxon is seen > threshold (w.r.t. all lineages)
846             # then continue with the lineages featuring it
847             # otherwise stop at previous taxon
848 0           my $n = @lineages;
849              
850             TAXON:
851 0           for (my $i = 0; $n; $i++) {
852 0   0 0     my %count_for = count_by { $_->[$i] // q{} } @lineages;
  0            
853 0     0     my $taxon = max_by { $count_for{$_} } keys %count_for;
  0            
854 0 0         last TAXON unless $taxon;
855 0           my $taxon_n = $count_for{$taxon};
856 0 0         last TAXON if $taxon_n / $n < $threshold;
857 0           push @common_lineage, $taxon;
858 0   0       @lineages = grep { ( $_->[$i] // q{} ) eq $taxon } @lineages;
  0            
859             }
860              
861             # examine context for returning plain array or ArrayRef
862 0 0         return wantarray ? @common_lineage : \@common_lineage;
863             }
864              
865             # tree annotation methods
866              
867              
868             sub attach_taxonomies_to_terminals {
869 0     0 1   my $self = shift;
870 0           my $tree = shift;
871              
872             # store tip taxonomies in Bio::Phylo::Forest::Node generic attributes
873 0           for my $tip ( @{ $tree->tree->get_terminals } ) {
  0            
874              
875             # fetch taxonomy (and level list) from tip's seq id
876 0           my @tax = $self->get_taxonomy_with_levels_from_seq_id($tip->get_name);
877              
878             # attach them as distinct ArrayRefs
879 0           $tip->set_generic('taxonomy' => [ map { $_->[0] } @tax ] );
  0            
880 0           $tip->set_generic('levels' => [ map { $_->[1] } @tax ] );
  0            
881              
882             # Note: levels are needed for robust clade naming and collapsing.
883             # Indeed, get_level_from_name() can return incorrect level when a name
884             # exists at more than one taxonomic level in NCBI Taxonomy (e.g.,
885             # Bacteria, which is also an insect genus).
886             }
887              
888 0           return;
889             }
890              
891              
892              
893             sub attach_taxonomies_to_internals {
894 0     0 1   my $self = shift;
895 0           my $tree = shift;
896              
897             # post-order tree traversal
898             $tree->tree->visit_depth_first(
899              
900             # infer common taxonomy from direct children of each node
901             -post => sub {
902 0     0     my $node = shift;
903 0 0         return if $node->is_terminal;
904              
905             # collect children lineages
906             # ... and track the longest (better to be safe...) level list
907 0           my @lineages;
908             my @max_levels;
909 0           for (my $i = 0; my $child = $node->get_child($i); $i++) {
910 0           push @lineages, $child->get_generic('taxonomy');
911 0           my @levels = @{ $child->get_generic('levels') };
  0            
912 0 0         @max_levels = @levels if @levels > @max_levels;
913             }
914              
915             # store into current node their common lineage
916             # ... and cut down level list to match this common lineage
917 0           my @common_lineage = $self->_common_taxonomy(1.0, @lineages);
918 0           $node->set_generic('taxonomy' => \@common_lineage);
919 0           $node->set_generic(
920             'levels' => [ @max_levels[0..$#common_lineage] ]
921             );
922              
923 0           return;
924             },
925 0           );
926              
927 0           return;
928             }
929              
930              
931              
932             sub attach_taxa_to_entities {
933 0     0 1   my $self = shift;
934 0           my $tree = shift;
935 0   0       my $args = shift // {}; # HashRef (should not be empty...)
936              
937             # setup minimal taxonomic level(s) to reach
938             # Note 1: only named levels can be selected (see list above)
939             # Note 2: thus, 'no rank' results in the lowest possible taxon
940             # Note 3: default values are provided
941 0           my %arg_levels = ( name => 'no rank', collapse => 'skip', %{$args} );
  0            
942             my %target_for = map {
943 0           $_ => $self->rank_for( $arg_levels{$_} )
  0            
944             } keys %arg_levels;
945              
946             # ensure meaningful subtree collapsing WRT naming
947 0 0         if ($target_for{collapse} < $target_for{name}) {
948 0           carp '[BMC] Warning: collapsing level must include naming level;'
949             . ' upgrading!';
950 0           $target_for{collapse} = $target_for{name};
951             }
952              
953             # update both terminal and internal nodes
954             # attach lowest taxon with level higher or equal to target level(s)
955              
956             NODE:
957 0           for my $node ( @{ $tree->tree->get_entities } ) {
  0            
958              
959             # reset taxon for robustness
960 0           my $is_named;
961 0           $node->set_generic('taxon' => undef);
962 0           $node->set_generic('taxon_collapse' => undef);
963              
964             # walk up node lineage
965 0           my @lineage = @{ $node->get_generic('taxonomy') };
  0            
966 0           my @levels = @{ $node->get_generic('levels') };
  0            
967 0           while (my $taxon = pop @lineage) {
968              
969             # OLD VERSION DERIVING LEVEL FROM TAXON NAME
970             # ... if taxon was the highest in the lineage then rank it at top
971             # Note: this is needed to preserve 'cellular organisms'
972             # my $rank = @lineage == 0 ? 0 # 0 means 'top rank'
973             # : $self->rank_for( $self->get_level_from_name($taxon) );
974              
975             # get rank for taxon
976 0           my $rank = $self->rank_for( pop @levels );
977              
978             # name subtree at target level or above
979 0 0 0       if (!$is_named && $rank >= $target_for{name}) {
980 0           $node->set_generic('taxon' => $taxon);
981 0           $is_named = 1; # to get the lowest possible taxon
982             } # ... while keeping walking up to collapse at a higher level
983              
984             # collapse subtree only at target level
985 0 0         if ($rank == $target_for{collapse}) {
986 0           $node->set_generic('taxon_collapse' => $taxon);
987 0           next NODE; # can stop walking up now!
988             }
989             }
990             }
991              
992 0           return;
993             }
994              
995              
996             # Listable IdList and IdMapper factory methods
997              
998             # TODO: decide on whether this kind of check is needed and where...
999              
1000             # around qr{ \A taxonomic_ }xms => sub {
1001             # my $method = shift;
1002             # my $self = shift;
1003             # my $listable = shift;
1004             #
1005             # # ensure the object is listable (we could also test for the role)
1006             # unless ( $listable->can('all_seq_ids') ) {
1007             # carp 'Cannot build list/mapper from ' . ref($listable) . '; aborting!';
1008             # return;
1009             # }
1010             #
1011             # return $self->$method($listable, @_);
1012             # };
1013              
1014             # TODO: remove support for GI numbers
1015              
1016              
1017             sub gi_mapper {
1018 0     0 1   my $self = shift;
1019 0           my $listable = shift;
1020              
1021             return IdMapper->new(
1022             long_ids => [ $self->_taxids_from_gis($listable) ],
1023 0           abbr_ids => [ map { $_->full_id } $listable->all_seq_ids ],
  0            
1024             );
1025             }
1026              
1027             const my $BATCH_SIZE => 252;
1028             const my $GOLDEN_RATIO => 1.618;
1029             const my $MAX_ATTEMPT => 3;
1030              
1031             sub _taxids_from_gis {
1032 0     0     my $self = shift;
1033 0           my $listable = shift;
1034              
1035             # get full_ids and GI numbers
1036 0           my @seq_ids = $listable->all_seq_ids;
1037 0           my @full_ids = map { $_->full_id } @seq_ids;
  0            
1038 0           my @gis = map { $_->gi } @seq_ids;
  0            
1039 0           ### GIs to process: scalar @gis
  0            
1040 0           my @uniq_gis = uniq @gis;
1041 0           ### Unique GIs to process: scalar @uniq_gis
  0            
1042              
1043             # try to associate taxon_ids to GIs using binary GI-to-taxid mapper
1044 0           my %taxid_for;
1045             try {
1046 0     0     %taxid_for = map { $_ => $self->get_taxid_from_gi($_) } @uniq_gis;
  0            
1047             }
1048             catch {
1049 0     0     ### Note: 'Cannot load binary GI-to-taxid mapper'
  0            
1050 0           };
1051              
1052             # collect missing taxon_ids
1053 0           my @miss_gis = grep { not $taxid_for{$_} } @uniq_gis;
  0            
1054              
1055 0 0         if (@miss_gis) {
1056 0           ### Unique GIs missing from binary GI-to-taxid mapper: scalar @miss_gis
  0            
1057 0           ### Fetching corresponding taxon_ids online instead...
  0            
1058             }
1059              
1060 0           my %miss_taxid_for;
1061 0           my $batch_size = $BATCH_SIZE;
1062 0           my $attempt = 1;
1063              
1064 0           ESUMMARY:
1065 0           while (@miss_gis) { ### Iterating on missing GIs |===[%] |
  0            
1066 0           my $total = @miss_gis;
1067 0           $batch_size = List::AllUtils::min($batch_size, $total);
1068             #### $total
1069             #### $batch_size
1070             #### $attempt
1071              
1072             BATCH:
1073 0           for (my $i = 0; $i < $total; $i += $batch_size) {
1074 0           my $end = List::AllUtils::min($i + $batch_size, $#miss_gis);
1075             #### $i
1076             #### $end
1077              
1078             # fetch eSummaries for current batch of GI numbers
1079 0           my $report = get(
1080             'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?'
1081             . 'db=protein&id=' . join ',', @miss_gis[$i..$end]
1082             );
1083              
1084             # stop using current batch size if no answer from NCBI servers
1085 0 0         last BATCH unless $report;
1086              
1087             # parse XML report into hash tree
1088 0           my $ob = XML::Bare->new( text => $report );
1089 0           my $root = $ob->parse();
1090              
1091             # capture GI number/taxon_id pairs for each eSummary of the report
1092 0           for my $docsum ( @{ $root->{eSummaryResult}{DocSum} } ) {
  0            
1093 0           my @fields = @{ $docsum->{Item} };
  0            
1094 0     0     my $gi = first { $_->{Name}->{value} eq 'Gi' } @fields;
  0            
1095 0     0     my $taxid = first { $_->{Name}->{value} eq 'TaxId' } @fields;
  0            
1096 0           $miss_taxid_for{ $gi->{value} } = $taxid->{value};
1097             }
1098             }
1099              
1100             # update hash with retrieved taxon_ids
1101 0           %taxid_for = (%taxid_for, %miss_taxid_for);
1102              
1103             # collect anew missing taxon_ids
1104 0           @miss_gis = grep { not $taxid_for{$_} } @uniq_gis;
  0            
1105              
1106             # if some taxon_ids are still missing...
1107 0 0         if (@miss_gis) {
1108              
1109             # reduce batch_size after MAX_ATTEMPT attempts
1110 0 0         if (++$attempt > $MAX_ATTEMPT) {
1111 0           $batch_size = floor($batch_size / $GOLDEN_RATIO);
1112 0           $attempt = 1;
1113             }
1114              
1115             # stop trying if batch_size falls to zero
1116 0 0         unless ($batch_size) {
1117 0           carp '[BMC] Warning: no answer from NCBI E-utilities;'
1118             . ' dropping GIs!';
1119 0           last ESUMMARY;
1120             }
1121             }
1122             }
1123              
1124 0           ### Unique GIs retrieved online: scalar keys %miss_taxid_for
  0            
1125 0           ### Unique GIs still missing: scalar @miss_gis
  0            
1126              
1127             # build new ids by prepending taxon_ids to full_ids
1128 0           my @taxids_gis;
1129 0           my $ea = each_array(@full_ids, @gis);
1130 0           while ( my ($full_id, $gi) = $ea->() ) {
1131 0           push @taxids_gis, $taxid_for{$gi} . '|' . $full_id;
1132             }
1133              
1134 0           return @taxids_gis;
1135             }
1136              
1137              
1138              
1139             sub tab_mapper {
1140 0     0 1   my $self = shift;
1141 0           my $infile = shift;
1142 0   0       my $args = shift // {};
1143              
1144 0   0       my $col = $args->{column} // 1;
1145 0   0       my $sep = $args->{separator} // qr{\t}xms;
1146 0           my $idm = $args->{gi2taxid};
1147              
1148 0           open my $in, '<', $infile;
1149              
1150 0           tie my %family_for, 'Tie::IxHash';
1151              
1152             LINE:
1153 0           while ( my $line = <$in>) {
1154 0           chomp $line;
1155              
1156             # skip empty lines and comment lines
1157 0 0 0       next LINE if $line =~ $EMPTY_LINE
1158             || $line =~ $COMMENT_LINE;
1159              
1160 0           my @fields = split $sep, $line;
1161 0           $family_for{ $fields[0] } = $fields[$col];
1162             }
1163              
1164 0           my @abbr_ids = keys %family_for;
1165 0           my @must_ids = @abbr_ids;
1166              
1167 0 0         if ($idm) {
1168              
1169             # build gi2taxid 'mapper'
1170 0           my $gi_mapper = IdMapper->load($idm);
1171 0           my @gis = map { $_->gi } $gi_mapper->all_abbr_seq_ids;
  0            
1172 0           my @taxon_ids = map { $_->taxon_id } $gi_mapper->all_long_seq_ids;
  0            
1173 0           my %taxid_for = mesh @gis, @taxon_ids;
1174              
1175             # build modern seq_ids from GIs using 'mapper'
1176 0           for my $id (@must_ids) {
1177 0           my $taxon_id = $taxid_for{ SeqId->new( full_id => $id )->gi };
1178 0           my @taxonomy = $self->get_taxonomy($taxon_id);
1179 0           my $org = $taxonomy[-1];
1180 0           $id = SeqId->new_with(
1181             org => $org,
1182             taxon_id => $taxon_id,
1183             gi => $id, # actually not a 'pure' GI here
1184             )->full_id;
1185             }
1186             }
1187              
1188 0           my @long_ids;
1189              
1190 0           my $ea = each_array @must_ids, @abbr_ids;
1191 0           while ( my ($must_id, $abbr_id) = $ea->() ) {
1192 0   0       ( my $family = $family_for{$abbr_id} // q{} ) =~ tr/- @_/./;
1193 0 0         $family .= '-' if $family;
1194 0           push @long_ids, $family . $must_id;
1195             }
1196              
1197 0           return IdMapper->new(
1198             long_ids => \@long_ids,
1199             abbr_ids => \@abbr_ids,
1200             );
1201             }
1202              
1203              
1204              
1205             sub tax_mapper { ## no critic (RequireArgUnpacking)
1206 0     0 1   my $self = shift;
1207 0           my $listable = shift;
1208              
1209             return IdMapper->new(
1210             long_ids => [ map {
1211 0           $self->get_nexus_label_from_seq_id($_, @_)
1212             } $listable->all_seq_ids ],
1213 0           abbr_ids => [ map { $_->full_id } $listable->all_seq_ids ],
  0            
1214             );
1215             }
1216              
1217              
1218              
1219             sub tax_filter {
1220 0     0 1   my $self = shift;
1221 0           my $list = shift;
1222              
1223 0           return Filter->new( tax => $self, _specs => $list );
1224             }
1225              
1226              
1227              
1228             sub tax_criterion {
1229 0     0 1   my $self = shift;
1230 0           my $args = shift;
1231              
1232 0           $args->{tax_filter} = $self->tax_filter( $args->{tax_filter} );
1233              
1234 0           return Criterion->new($args);
1235             }
1236              
1237              
1238              
1239             sub tax_category {
1240 0     0 1   my $self = shift;
1241 0           my $args = shift;
1242              
1243             $args->{criteria} = [
1244 0           map { $self->tax_criterion($_) } @{ $args->{criteria} }
  0            
  0            
1245             ];
1246              
1247 0           return Category->new($args);
1248             }
1249              
1250              
1251             # Classifier/Labeler/ColorScheme factory methods
1252              
1253              
1254             sub tax_classifier {
1255 0     0 1   my $self = shift;
1256 0           my $args = shift;
1257              
1258             $args->{categories} = [
1259 0           map { $self->tax_category($_) } @{ $args->{categories} }
  0            
  0            
1260             ];
1261              
1262 0           return Classifier->new($args);
1263             }
1264              
1265             # example of input HashRef for tax_classifier
1266             # 'min', 'max' and 'description' keys are both optional
1267             # categories => [
1268             # {
1269             # criteria => [
1270             # {
1271             # max => undef,
1272             # min => 1,
1273             # tax_filter => [
1274             # '+Latimeria'
1275             # ]
1276             # },
1277             # {
1278             # tax_filter => [
1279             # '+Protopterus'
1280             # ]
1281             # },
1282             # {
1283             # tax_filter => [
1284             # '+Danio',
1285             # '+Oreochromis'
1286             # ]
1287             # },
1288             # {
1289             # tax_filter => [
1290             # '+Xenopus'
1291             # ]
1292             # },
1293             # {
1294             # tax_filter => [
1295             # '+Anolis',
1296             # '+Gallus',
1297             # '+Meleagris',
1298             # '+Taeniopygia'
1299             # ]
1300             # },
1301             # {
1302             # tax_filter => [
1303             # '+Mammalia'
1304             # ]
1305             # }
1306             # ],
1307             # description => 'strict species sampling',
1308             # label => 'strict'
1309             # },
1310             # {
1311             # criteria => [
1312             # {
1313             # tax_filter => [
1314             # '+Latimeria'
1315             # ]
1316             # },
1317             # {
1318             # tax_filter => [
1319             # '+Protopterus'
1320             # ]
1321             # },
1322             # {
1323             # tax_filter => [
1324             # '+Danio',
1325             # '+Oreochromis'
1326             # ]
1327             # },
1328             # {
1329             # tax_filter => [
1330             # '+Amphibia',
1331             # '+Amniota'
1332             # ]
1333             # }
1334             # ],
1335             # description => 'loose species sampling',
1336             # label => 'loose'
1337             # }
1338             # ]
1339              
1340              
1341              
1342             sub tax_labeler_from_systematic_frame {
1343 0     0 1   my $self = shift;
1344 0           my $infile = shift;
1345              
1346             # Thursday 26 November 2015 at 15 hours 21
1347             # ((((((Crenarchaeota:371:88:-1,Korarchaeota:371:72:-1)a:15:80:-1,...)Tree of Life:3:16:-1;
1348              
1349             # ensure that we get a parseable tree from the .fra file
1350             # by considering only line 2 and turning all funny 'branch lengths' to 1
1351 0           my @lines = file($infile)->slurp;
1352 0           (my $newick_str = pop @lines) =~ s/(?: :-?(\d+) ){3}/:1/xmsg;
1353              
1354             # parse tree using Bio::Phylo
1355             # Note: keep whitespace because tip labels are not between quotes
1356 0           my $tree = parse(
1357             -format => 'newick',
1358             -string => $newick_str,
1359             -keep_whitespace => 1,
1360             )->first;
1361              
1362             # extract tip labels
1363 0           my @labels = map { $_->get_name } @{ $tree->get_terminals };
  0            
  0            
1364              
1365             # build classifier from labels
1366 0           return $self->tax_labeler_from_list( \@labels );
1367             }
1368              
1369              
1370              
1371             sub tax_labeler_from_list {
1372 0     0 1   my $self = shift;
1373 0           my $list = shift;
1374              
1375 0           return Labeler->new( tax => $self, labels => $list );
1376             }
1377              
1378              
1379              
1380             sub load_color_scheme { ## no critic (RequireArgUnpacking)
1381 0     0 1   my $self = shift;
1382 0           my $scheme = ColorScheme->new( tax => $self );
1383 0           return $scheme->load(@_);
1384             }
1385              
1386              
1387              
1388             sub eq_tax { ## no critic (RequireArgUnpacking)
1389 0     0 1   my $self = shift;
1390 0           my $got = shift;
1391 0           my $expect = shift;
1392 0           my $classifier = shift;
1393              
1394             # classify got and expect orgs
1395 0           my $got_taxon = $classifier->classify($got, @_);
1396 0           my $exp_taxon = $classifier->classify($expect, @_);
1397              
1398             # use context to decide what to return
1399             # list context: return taxon labels
1400 0 0         return ($got_taxon, $exp_taxon)
1401             if wantarray;
1402              
1403             # scalar context: compare taxon labels if both are defined
1404             return undef ## no critic (ProhibitExplicitReturnUndef)
1405 0 0 0       unless $got_taxon && $exp_taxon;
1406              
1407 0           return $got_taxon eq $exp_taxon;
1408             }
1409              
1410              
1411             # I/O METHODS
1412              
1413             const my $CACHEDB => 'cachedb.bin';
1414              
1415              
1416             sub new_from_cache { ## no critic (RequireArgUnpacking)
1417 0     0 1   my $class = shift;
1418 0           my %args = @_; # TODO: handle HashRef?
1419              
1420 0           ### Loading NCBI Taxonomy from binary cache file...
  0            
1421 0           my $tax_dir = dir( glob $args{tax_dir} );
1422 0           my $cachefile = file($tax_dir, $CACHEDB);
1423 0           my $tax = $class->load($cachefile, inject => { tax_dir => $tax_dir } );
1424              
1425 0           ### Done!
  0            
1426 0           return $tax;
1427             }
1428              
1429              
1430             sub update_cache {
1431 0     0 1   my $self = shift;
1432              
1433 0           my $cachefile = file($self->tax_dir, $CACHEDB);
1434 0           ### Updating binary cache file: $cachefile->stringify
  0            
1435 0           $self->store($cachefile);
1436              
1437 0           ### Done!
  0            
1438 0           return 1;
1439             }
1440              
1441              
1442             # class method to setup local taxonomy database
1443              
1444              
1445             sub setup_taxdir {
1446 0     0 1   my $class = shift;
1447 0           my $tax_dir = shift;
1448 0   0       my $args = shift // {}; # HashRef (should not be empty...)
1449              
1450 0           my $source = $args->{source};
1451              
1452 0 0         $class->_setup_ncbi_taxdir($tax_dir, $args)
1453             if $source eq 'ncbi';
1454              
1455 0 0         $class->_setup_gtdb_taxdir($tax_dir)
1456             if $source eq 'gtdb';
1457              
1458 0           return;
1459             }
1460              
1461              
1462             sub _setup_ncbi_taxdir {
1463 0     0     my $class = shift;
1464 0           my $tax_dir = shift;
1465 0   0       my $args = shift // {}; # HashRef (should not be empty...)
1466              
1467 0   0       my $gi_mapper = $args->{gi_mapper} // 0;
1468              
1469             # setup local directory
1470 0           $tax_dir = dir( glob $tax_dir );
1471 0           $tax_dir->mkpath();
1472              
1473 0           ### Installing NCBI Taxonomy database to: $tax_dir->stringify
  0            
1474 0           ### Please be patient...
  0            
1475              
1476             # setup remote archive access
1477 0           my $base = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy';
1478 0 0         my @targets = (
1479             'taxdump.tar.gz',
1480             $gi_mapper ? qw(gi_taxid_nucl.dmp.gz gi_taxid_prot.dmp.gz) : ()
1481             );
1482              
1483             # download and install file(s)...
1484             # ... first, the taxon_id version...
1485 0           my @dmpfiles;
1486 0           for my $target (@targets) {
1487 0           my $url = "$base/$target";
1488              
1489 0           ### Downloading: $url
  0            
1490 0           my $zipfile = file($tax_dir, $target)->stringify;
1491             # Note: stringify is required by getstore
1492 0           my $ret_code = getstore($url, $zipfile);
1493 0 0         croak "[BMC] Error: cannot download $url: error $ret_code; aborting!"
1494             unless $ret_code == 200;
1495              
1496             # TODO: use modules for unarchiving (not that easy)
1497              
1498 0           ### Unarchiving: $zipfile
  0            
1499 0 0         if ($target =~ m/\A gi_taxid/xms) { # GI mappers
1500 0           system("gzip -d $zipfile");
1501 0           push @dmpfiles, change_suffix( $zipfile, q{} );
1502             }
1503             else { # main tax archive
1504 0           system("tar -xzf $zipfile -C $tax_dir");
1505 0           file($zipfile)->remove;
1506             }
1507             }
1508              
1509             # delete gca files if any
1510 0           my $gcanamefile = file($tax_dir, 'gca0-names.dmp');
1511 0           my $gcanodefile = file($tax_dir, 'gca0-nodes.dmp');
1512 0 0         $gcanamefile->remove if -e $gcanamefile;
1513 0 0         $gcanodefile->remove if -e $gcanodefile;
1514              
1515             #... second, the accession_id version
1516 0           $class->_make_gca_files($tax_dir);
1517              
1518             # return true on success (only check main files)
1519 0 0 0       if ( -r $gcanamefile && -r $gcanodefile ) {
1520 0           ### Successfully wrote GCA-based files!
  0            
1521             }
1522              
1523             # delete cache if any
1524 0           my $cachefile = file($tax_dir, $CACHEDB);
1525 0 0         $cachefile->remove if -e $cachefile;
1526              
1527             # optionally build binary GI mapper from GI-to-taxid flat files
1528 0 0         if ($gi_mapper) {
1529 0           my $mrgfile = file($tax_dir, 'gi_taxid_nucl_prot.dmp')->stringify;
1530 0           ### Merging GI-to-taxid flat files to: $mrgfile
  0            
1531 0           system("sort -nm @dmpfiles > $mrgfile");
1532 0           my $binfile = change_suffix($mrgfile, '.bin');
1533 0           ### Building binary GI-to-taxid mapper: $binfile
  0            
1534 0           new_dict( in => $mrgfile, out => $binfile );
1535 0           file($_)->remove for @dmpfiles, $mrgfile;
1536             }
1537              
1538             # return true on success (only check main files)
1539 0 0 0       if ( -r file($tax_dir, 'names.dmp') && -r file($tax_dir, 'nodes.dmp') ) {
1540 0           ### Successfully wrote taxid files!
  0            
1541 0           return 1;
1542             }
1543              
1544 0           ### Failed installation!
  0            
1545 0           return 0;
1546             }
1547              
1548             sub _make_gca_files {
1549 0     0     my $class = shift;
1550 0           my $tax_dir = shift;
1551              
1552 0           const my $FS => qq{\t|\t};
1553 0           const my $FS_REGEX => qr{\t \| \t}xms;
1554              
1555 0           my %rank_for;
1556             my %parent_taxid_for;
1557              
1558             # open nodes.dmp file
1559 0           my $nodes = file($tax_dir, 'nodes.dmp');
1560 0           open my $in, '<', $nodes;
1561              
1562             # get taxon_id and parent_taxon_id from nodes.dmp file
1563 0           while (my $line = <$in>) {
1564 0           chomp $line;
1565 0           my ($taxon_id, $parent_taxon_id, $rank) = $line
1566             =~ m/^ (\d+) $FS_REGEX (\d+) $FS_REGEX ([^\t]+) $FS_REGEX/xmsg;
1567 0           $rank_for{$taxon_id} = $rank;
1568 0           $parent_taxid_for{$taxon_id} = $parent_taxon_id;
1569             }
1570              
1571             # create helper Taxonomy object
1572 0           my $tax = $class->new( tax_dir => $tax_dir );
1573              
1574             # define remote assembly reports filenames
1575             # Note: the order is significant (last files dominate over first files)
1576 0           my @targets_acc = qw(
1577             assembly_summary_genbank_historical.txt
1578             assembly_summary_refseq_historical.txt
1579             assembly_summary_genbank.txt
1580             assembly_summary_refseq.txt
1581             );
1582              
1583 0           my %name_for;
1584             my %node_for;
1585 0           my %fix_for;
1586              
1587 0           my $base_acc = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS';
1588              
1589 0           for my $target (@targets_acc) {
1590             #### $target
1591              
1592 0           my $url = "$base_acc/$target";
1593 0           my $file = file($tax_dir, $target)->stringify;
1594             #### $file
1595              
1596 0           ### Downloading: $url
  0            
1597 0           my $ret_code = getstore($url, $file);
1598 0 0         croak "[BMC] Error: cannot download $url: error $ret_code; aborting!"
1599             unless $ret_code == 200;
1600              
1601             # parse file for accession numbers and related taxon_ids
1602 0           open my $fh, '<', $file;
1603              
1604             LINE:
1605 0           while (my $line = <$fh>) {
1606 0           chomp $line;
1607              
1608             # skip empty lines and comment lines
1609 0 0 0       next LINE if $line =~ $EMPTY_LINE
1610             || $line =~ $COMMENT_LINE;
1611              
1612 0           my ($accession, $taxon_id, $species_taxon_id)
1613             = (split /\t/xms, $line)[0,5,6];
1614              
1615             # update merged taxon_id (mostly from historical assembly files)
1616 0 0         $taxon_id = $tax->merged_for($taxon_id)
1617             if $tax->is_merged($taxon_id);
1618 0 0         $species_taxon_id = $tax->merged_for($species_taxon_id)
1619             if $tax->is_merged($species_taxon_id);
1620              
1621             # skip deleted nodes (again mostly from historical assembly files)
1622 0 0         next LINE if $tax->is_deleted($taxon_id);
1623              
1624             # fetch taxonomy and org using taxon_id
1625 0           my @taxonomy = $tax->get_taxonomy($taxon_id);
1626 0           my $org = $taxonomy[-1];
1627              
1628             # use parent taxon_id if no taxon_id for strain
1629 0 0         if ($species_taxon_id == $taxon_id) {
1630 0           $species_taxon_id = $parent_taxid_for{$taxon_id};
1631 0           $fix_for{$taxon_id}{name_for} = $org;
1632 0           $fix_for{$taxon_id}{node_for} = $species_taxon_id;
1633             }
1634              
1635 0           $name_for{$accession} = $org;
1636 0           $node_for{$accession} = $species_taxon_id;
1637             }
1638 0           close $fh;
1639             }
1640              
1641             # write names.dmp
1642 0           my $names_gca_file = file($tax_dir, 'gca0-names.dmp');
1643 0           open my $names_out, '>', $names_gca_file;
1644              
1645 0           for my $accession ( keys %name_for ) {
1646 0           say {$names_out} join $FS,
1647 0           $accession, $name_for{$accession}, q{}, 'scientific name';
1648             }
1649 0           for my $taxon_id ( keys %fix_for ) {
1650 0           say {$names_out} join $FS,
1651 0           $taxon_id, $fix_for{$taxon_id}{name_for}, q{}, 'scientific name';
1652 0           $rank_for{$fix_for{$taxon_id}{name_for}} = $rank_for{$taxon_id};
1653             }
1654              
1655             # write nodes.dmp
1656 0           my $nodes_gca_file = file($tax_dir, 'gca0-nodes.dmp');
1657 0           open my $nodes_out, '>', $nodes_gca_file;
1658              
1659 0           for my $accession ( keys %node_for ) {
1660 0           say {$nodes_out} join $FS, $accession, $node_for{$accession},
1661 0   0       $rank_for{$name_for{$accession}} // 'no rank';
1662             }
1663 0           for my $taxon_id ( keys %fix_for ) {
1664 0           say {$nodes_out} join $FS, $taxon_id, $fix_for{$taxon_id}{node_for},
1665 0           $rank_for{$taxon_id};
1666             }
1667              
1668 0           return;
1669             }
1670              
1671              
1672             # http://www.ncbi.nlm.nih.gov/news/11-21-2013-strain-id-changes/
1673             #
1674             # Planned change in bacterial strain-level information management
1675             #
1676             # Please be aware that there is an upcoming change (January 2014) in how NCBI
1677             # manages organism strain information. Due to significant increases in the
1678             # volume of strain-specific sequencing, we are changing our management of
1679             # strain information.
1680             #
1681             # Next generation sequencing has already changed the way microbial genomes are
1682             # being used. The scope of microbial sequencing projects has shifted from a
1683             # single isolate representing an organism to multi-isolate and multi-species
1684             # projects representing microbial communities. Consequently, in the first nine
1685             # months of 2013 the sequences of more than 6000 prokaryotic genomes were
1686             # released by INSDC (DDBJ/ENA/GenBank).
1687             #
1688             # NCBI is introducing several changes in prokaryotic genomes and related
1689             # resources such as Assembly, BioProject, BioSample, and Taxonomy that will
1690             # affect your submissions, data downloads, analysis tools, and parsers.
1691             #
1692             # Taxonomy
1693             #
1694             # Assigning strain-level TaxID will be discontinued in January 2014 because
1695             # curation of strain-level TaxIDs will not remain possible under such growth.
1696             # However, the thousands of existing strain-level TaxIDs will remain, and we
1697             # will continue to add informal strain-specific names for genomes from
1698             # specimens that have not yet been identified to the species level, e.g.
1699             # “Rhizobium sp. CCGE 510” and “Micromonas sp. RCC299”. The strain information
1700             # will continue to be collected and displayed.
1701             #
1702             # BioSample
1703             #
1704             # Submitters of genome sequences will be required to register sample meta-data
1705             # in the BioSample database for each organism that they are sequencing. The
1706             # BioSample submission will include the strain information and other metadata,
1707             # such as culture collection and isolation information, as appropriate. The
1708             # BioSample accession will be a link on the GenBank records, and the GenBank
1709             # records themselves will display the strain in the source information.
1710             #
1711             # BioProject
1712             #
1713             # Submitters of genome sequences are already required to register meta-data
1714             # about the research project in the BioProject database. We no longer require
1715             # a one-to-one relationship between a BioProject accession and a genome.
1716             # Instead, a research effort examining multiple strains of a species or
1717             # multiple species of drug-resistant bacteria, for example, could be
1718             # registered as a single BioProject.
1719             #
1720             # Assembly
1721             #
1722             # Each genome assembly is loaded to the Assembly database and assigned an
1723             # Assembly accession. The Assembly accession is specific for a particular
1724             # genome submission.
1725             #
1726             # What defines a genome?
1727             #
1728             # A BioProject ID or accession cannot be used to define a single genome, since
1729             # many may belong to a multi-isolate or multi-species project. Furthermore, a
1730             # TaxID can no longer reliably define an individual genome since unique TaxIDs
1731             # will not be assigned for individual strains and isolates. The collection of
1732             # DNA sequences of an individual sample (isolate) will be represented by a
1733             # BioSample accession and if raw sequence reads are assembled and submitted to
1734             # GenBank they will get a unique Assembly accession. The Assembly accession is
1735             # specific for a particular genome submission. For example, sequence data
1736             # generated from a single sample (with a BioSample accession) could be
1737             # assembled with two different algorithms and so have two sets of GenBank
1738             # accessions, each with its own Assembly accession.
1739             # BioSample accession and each assembled genome has its own Assembly
1740             # accession. This BioProject includes an isolate of Listeria monocytogenes
1741             # (TaxID 1639, strain R2-502) which was registered as BioSample SAMN02203126,
1742             # and its genome is represented in GenBank records CP006595-CP006596, which
1743             # are tracked as a group in the Assembly database under accession
1744             # GCA_000438585.
1745             #
1746             # FTP files
1747             #
1748             # Genome text reports on the FTP site have been modified to include the
1749             # BioSample and Assembly accessions. These two columns were added at the end
1750             # of the tables to minimize problems for existing parsers. Initially, not all
1751             # assemblies will have a BioSample accession because we are still in the
1752             # process of back-filling BioSamples for genomes.
1753             #
1754             # These changes will occur in January 2014. We will be releasing more
1755             # information as the date approaches.
1756              
1757             sub _setup_gtdb_taxdir {
1758 0     0     my $class = shift;
1759 0           my $tax_dir = shift;
1760              
1761             # setup local directory
1762 0           $tax_dir = dir( glob $tax_dir );
1763 0           $tax_dir->mkpath();
1764              
1765 0           ### Installing GTDB Taxonomy database to: $tax_dir->stringify
  0            
1766 0           ### Please be patient...
  0            
1767              
1768             # setup remote archive access
1769 0           my $base = 'https://data.gtdb.ecogenomic.org/releases/latest';
1770 0           my @targets = (
1771             'ar122_metadata.tar.gz' , 'bac120_metadata.tar.gz',
1772             'FILE_DESCRIPTIONS', 'METHODS', 'RELEASE_NOTES', 'VERSION'
1773             );
1774              
1775 0           for my $target (@targets) {
1776 0           my $url = "$base/$target";
1777              
1778 0           ### Downloading: $url
  0            
1779 0           my $zipfile = file($tax_dir, $target)->stringify;
1780 0           my $ret_code = getstore($url, $zipfile);
1781 0 0         croak "[BMC] Error: cannot download $url: error $ret_code; aborting!"
1782             unless $ret_code == 200;
1783              
1784 0 0         if ($target =~ m/metadata/xms) {
1785 0           ### Unarchiving: $zipfile
  0            
1786 0           system("tar -xzf $zipfile -C $tax_dir");
1787 0           file($zipfile)->remove;
1788             }
1789             }
1790              
1791 0           my $arc_file = file($tax_dir, 'ar122_metadata*.tsv');
1792 0           my $bac_file = file($tax_dir, 'bac120_metadata*.tsv');
1793 0           my $new_arcfile = file($tax_dir, 'archaea_metadata.tsv');
1794 0           my $new_bacfile = file($tax_dir, 'bacteria_metadata.tsv');
1795              
1796             # change file name to avoid GTDB version in basename
1797 0 0         $new_arcfile->remove if -e $new_arcfile;
1798 0 0         $new_bacfile->remove if -e $new_bacfile;
1799 0           system("mv $arc_file $new_arcfile");
1800 0           system("mv $bac_file $new_bacfile");
1801              
1802             # return true on success (only check main files)
1803 0 0 0       if ( -r $new_arcfile && -r $new_bacfile ) {
1804 0           ### Successfully downloaded metadata files!
  0            
1805             }
1806              
1807             else {
1808 0           ### Failed installation!
  0            
1809 0           return 0;
1810             }
1811              
1812             # concatenate metadata TSV files
1813 0           my $prok_file = file($tax_dir, 'prok_metadata.tsv');
1814 0 0         $prok_file->remove if -e $prok_file;
1815 0           system("cat $new_arcfile $new_bacfile > $prok_file");
1816              
1817             # create hash from metadata file
1818 0           my $table_for = _read_gtdb_metadata($prok_file);
1819              
1820             # hash for rank code
1821             my %rank_for = map {
1822 0 0         $_ eq 'superkingdom' ? 'd__' : substr($_, 0, 1) . '__' => $_
  0            
1823             } qw (superkingdom phylum class order family genus species);
1824              
1825             # setup taxonomic tree
1826 0           my %tree = (
1827             1 => {
1828             rank => 'no rank',
1829             name => 'root',
1830             uniq_name => q{},
1831             children => {
1832             2 => {
1833             rank => 'no rank',
1834             name => 'cellular organisms',
1835             children => {},
1836             },
1837             },
1838             }
1839             );
1840              
1841             # fill up taxonomic tree
1842 0           for my $gca ( keys %{$table_for} ) {
  0            
1843              
1844             # get GTDB taxonomy
1845 0           my $lineage = $table_for->{$gca}{'gtdb_taxonomy'};
1846 0           my @taxonomy = split q{;}, $lineage;
1847              
1848 0           my $tree_ref = $tree{1}{children}{2}{children};
1849              
1850             # count duplicate taxon in lineage
1851 0     0     my %count_for = count_by { substr( $_, 3, length()-1 ) } @taxonomy;
  0            
1852              
1853 0           while (my $gtdb_taxon = shift @taxonomy) {
1854              
1855             # get taxon rank
1856 0           my ($rank_code, $taxon) = ($gtdb_taxon) =~ m/^([a-z]__)(.*)/xms;
1857 0           my $rank = $rank_for{$rank_code};
1858              
1859             # create taxid for taxon
1860 0           my $taxon_id = $rank_code . lc($taxon =~ tr/A-Za-z0-9//cdr);
1861              
1862             # create taxid entry (if not yet existing)
1863 0 0         unless ( $tree_ref->{$taxon_id} ) {
1864 0           $tree_ref->{$taxon_id}{name} = $taxon;
1865 0           $tree_ref->{$taxon_id}{rank} = $rank;
1866             }
1867              
1868             # set a unique name (if duplicate taxon)
1869 0 0         my $uniq_name = $count_for{$taxon} > 1
1870             ? $taxon . ' <' . lc $rank . '>' : q{};
1871             # ... and add it in hash
1872 0 0         $tree_ref->{$taxon_id}{uniq_name} = $uniq_name
1873             if $uniq_name;
1874              
1875             # store GCA|F
1876 0 0         if ($rank eq 'species') {
1877              
1878             # change GCF to GCA and store it
1879 0 0         if ($gca =~ m/^GCF/xms) {
1880 0           (my $new_gca = $gca ) =~ s/GCF/GCA/xms;
1881 0           push @{ $tree_ref->{$taxon_id}{gca} }, $new_gca;
  0            
1882             }
1883              
1884 0           push @{ $tree_ref->{$taxon_id}{gca} }, $gca;
  0            
1885             }
1886              
1887             # setup children entry if lineage not yet exhausted
1888 0 0         if (@taxonomy) {
1889 0   0       $tree_ref->{$taxon_id}{children} //= {};
1890             $tree_ref = $tree_ref->{$taxon_id}{children}
1891 0           }
1892             }
1893             }
1894              
1895             # taxid files
1896 0           my $name_file = file($tax_dir, 'names.dmp');
1897 0           my $node_file = file($tax_dir, 'nodes.dmp');
1898 0           open my $name_out, '>', $name_file;
1899 0           open my $node_out, '>', $node_file;
1900              
1901             # gca files
1902 0           my $gcanamefile = file($tax_dir, 'gca0-names.dmp');
1903 0           my $gcanodefile = file($tax_dir, 'gca0-nodes.dmp');
1904 0           open my $gcaname_out, '>', $gcanamefile;
1905 0           open my $gcanode_out, '>', $gcanodefile;
1906              
1907             # create empty additional dmp files
1908 0           file($tax_dir, 'delnodes.dmp')->spew;
1909 0           file($tax_dir, 'merged.dmp' )->spew;
1910              
1911             # write regular dmp files (first recursive call)
1912             _write_dmp_files( $name_out, $node_out, $gcaname_out, $gcanode_out,
1913 0           1, 1, $tree{1} );
1914              
1915 0 0 0       if ( -r $name_file && -r $node_file ) {
1916 0           ### Successfully wrote taxid files!
  0            
1917             }
1918              
1919 0 0 0       if ( -r $gcanamefile && -r $gcanodefile ) {
1920 0           ### Successfully wrote GCA-based files!
  0            
1921             }
1922              
1923 0           return;
1924             }
1925              
1926             sub _read_gtdb_metadata {
1927 0     0     my $infile = shift;
1928              
1929 0           open my $in, '<', $infile;
1930              
1931 0           my %table_for;
1932             my @keys;
1933              
1934             LINE:
1935 0           while (my $line = <$in>) {
1936 0           chomp $line;
1937              
1938 0 0         if ($line =~ m/^accession/xms) {
1939 0           (undef, @keys) = split /\t/xms, $line;
1940 0           next LINE;
1941             }
1942              
1943 0           my ($gca, @values) = split /\t/xms, $line;
1944 0           $gca =~ s/GB_|RS_//xms;
1945 0           $table_for{$gca} = { mesh @keys, @values };
1946             }
1947              
1948 0           return \%table_for;
1949             }
1950              
1951             sub _write_dmp_files { ## no critic (ProhibitManyArgs)
1952 0     0     my ($name_out, $node_out, $gcaname_out, $gcanode_out,
1953             $taxon_id, $parent, $tree_ref) = @_;
1954              
1955 0           my $name = $tree_ref->{name };
1956 0           my $rank = $tree_ref->{rank };
1957 0   0       my $uniq_name = $tree_ref->{uniq_name} // q{};
1958 0   0       my $gcas = $tree_ref->{gca } // [];
1959              
1960             # write gca files
1961 0 0         if ($gcas) {
1962             _append2dmp($gcaname_out, $gcanode_out,
1963             $_, $parent, $name, $rank, $uniq_name)
1964 0           for sort @{$gcas};
  0            
1965             }
1966              
1967             # write taxid files
1968 0           _append2dmp($name_out, $node_out,
1969             $taxon_id, $parent, $name, $rank, $uniq_name);
1970              
1971 0           my $children_for = $tree_ref->{children};
1972 0 0         return unless $children_for;
1973              
1974 0           $parent = $taxon_id;
1975              
1976             _write_dmp_files( $name_out, $node_out, $gcaname_out, $gcanode_out,
1977             $_, $parent, $children_for->{$_} )
1978 0           for sort keys %{$children_for};
  0            
1979              
1980 0           return;
1981             }
1982              
1983             sub _append2dmp { ## no critic (ProhibitManyArgs)
1984 0     0     my ($name_out, $node_out,
1985             $taxon_id, $parent, $name, $rank, $uniq_name) = @_;
1986              
1987             # write names file
1988 0           say {$name_out} join "\t",
  0            
1989             $taxon_id , '|',
1990             $name , '|',
1991             $uniq_name, '|',
1992             'scientific name'
1993             ;
1994              
1995             # write nodes file
1996 0           say {$node_out} join "\t",
  0            
1997             $taxon_id, '|', $parent, '|', $rank,
1998             ( join "\t", '|', q{} ) x 10;
1999              
2000 0           return;
2001             }
2002              
2003 17     17   212 no Moose::Util::TypeConstraints;
  17         55  
  17         237  
2004              
2005             __PACKAGE__->meta->make_immutable;
2006             1;
2007              
2008             __END__
2009              
2010             =pod
2011              
2012             =head1 NAME
2013              
2014             Bio::MUST::Core::Taxonomy - NCBI Taxonomy one-stop shop
2015              
2016             =head1 VERSION
2017              
2018             version 0.212670
2019              
2020             =head1 SYNOPSIS
2021              
2022             # TODO
2023              
2024             =head1 DESCRIPTION
2025              
2026             # TODO
2027              
2028             =head1 METHODS
2029              
2030             =head2 get_taxid_from_seq_id
2031              
2032             =head2 get_taxid_from_legacy_seq_id
2033              
2034             =head2 get_taxonomy_from_seq_id
2035              
2036             =head2 get_taxid_from_taxonomy
2037              
2038             =head2 get_taxonomy_with_levels_from_seq_id
2039              
2040             =head2 get_taxa_from_taxid
2041              
2042             =head2 get_nexus_label_from_seq_id
2043              
2044             =head2 get_common_taxonomy_from_seq_ids
2045              
2046             =head2 attach_taxonomies_to_terminals
2047              
2048             =head2 attach_taxonomies_to_internals
2049              
2050             =head2 attach_taxa_to_entities
2051              
2052             =head2 gi_mapper
2053              
2054             =head2 tab_mapper
2055              
2056             =head2 tax_mapper
2057              
2058             =head2 tax_filter
2059              
2060             =head2 tax_criterion
2061              
2062             =head2 tax_category
2063              
2064             =head2 tax_classifier
2065              
2066             =head2 tax_labeler_from_systematic_frame
2067              
2068             =head2 tax_labeler_from_list
2069              
2070             =head2 load_color_scheme
2071              
2072             =head2 eq_tax
2073              
2074             =head2 setup_taxdir
2075              
2076             =head1 I/O METHODS
2077              
2078             =head2 new_from_cache
2079              
2080             =head2 update_cache
2081              
2082             =head1 ALIASES
2083              
2084             =head2 fetch_lineage
2085              
2086             =head2 compute_lca
2087              
2088             =head1 AUTHOR
2089              
2090             Denis BAURAIN <denis.baurain@uliege.be>
2091              
2092             =head1 CONTRIBUTORS
2093              
2094             =for stopwords Loic MEUNIER Mick VAN VLIERBERGHE
2095              
2096             =over 4
2097              
2098             =item *
2099              
2100             Loic MEUNIER <loic.meunier@doct.uliege.be>
2101              
2102             =item *
2103              
2104             Mick VAN VLIERBERGHE <mvanvlierberghe@doct.uliege.be>
2105              
2106             =back
2107              
2108             =head1 COPYRIGHT AND LICENSE
2109              
2110             This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.
2111              
2112             This is free software; you can redistribute it and/or modify it under
2113             the same terms as the Perl 5 programming language system itself.
2114              
2115             =cut