File Coverage

blib/lib/Bio/MUST/Core/Taxonomy.pm

Criterion	Covered	Total	%
statement	123	749	16.4
branch	0	144	0.0
condition	0	73	0.0
subroutine	41	93	44.0
pod	22	22	100.0
total	186	1081	17.2

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::MUST::Core::Taxonomy;
2							# ABSTRACT: NCBI Taxonomy one-stop shop
3							# CONTRIBUTOR: Loic MEUNIER <loic.meunier@doct.uliege.be>
4							# CONTRIBUTOR: Mick VAN VLIERBERGHE <mvanvlierberghe@doct.uliege.be>
5							$Bio::MUST::Core::Taxonomy::VERSION = '0.212670';
6	17			17		133	use Moose;
	17					44
	17					133
7	17			17		120326	use namespace::autoclean;
	17					47
	17					152
8
9	17			17		1820	use autodie;
	17					43
	17					135
10	17			17		96134	use feature qw(say);
	17					46
	17					1574
11
12	17			17		137	use Smart::Comments '###';
	17					47
	17					189
13
14	17			17		744802	use MooseX::Storage;
	17					518838
	17					79
15							with Storage('io' => 'StorableFile');
16
17	17			17		5903	use Moose::Util::TypeConstraints;
	17					43
	17					177
18
19	17			17		52616	use Algorithm::NeedlemanWunsch;
	17					59363
	17					670
20	17			17		157	use Carp;
	17					43
	17					1073
21	17			17		124	use Const::Fast;
	17					44
	17					173
22	17			17		10287	use File::Find::Rule;
	17					144242
	17					189
23	17			17		1497	use IPC::System::Simple qw(system);
	17					48
	17					1171
24	17			17		122	use List::AllUtils 0.12 qw(first firstidx uniq each_array mesh count_by max_by);
	17					458
	17					1288
25	17			17		8285	use LWP::Simple qw(get getstore);
	17					1109978
	17					192
26	17			17		3371	use Path::Class qw(dir file);
	17					44
	17					1085
27	17			17		124	use POSIX;
	17					44
	17					166
28	17			17		33247	use Scalar::Util qw(looks_like_number);
	17					66
	17					957
29	17			17		140	use Try::Tiny;
	17					43
	17					1014
30	17			17		8658	use Try::Tiny::Warnings;
	17					12318
	17					1024
31	17			17		9883	use XML::Bare;
	17					135481
	17					1026
32
33	17			17		8694	use Bio::LITE::Taxonomy::NCBI::Gi2taxid qw(new_dict);
	17					166013
	17					1177
34	17			17		169	use Bio::Phylo::IO qw(parse);
	17					48
	17					911
35
36	17			17		127	use Bio::MUST::Core::Types;
	17					42
	17					575
37	17			17		113	use Bio::MUST::Core::Constants qw(:ncbi :files);
	17					40
	17					3681
38	17			17		139	use Bio::MUST::Core::Utils qw(change_suffix);
	17					36
	17					885
39	17			17		130	use aliased 'Bio::MUST::Core::SeqId';
	17					45
	17					192
40	17			17		4232	use aliased 'Bio::MUST::Core::IdList';
	17					45
	17					79
41	17			17		3462	use aliased 'Bio::MUST::Core::IdMapper';
	17					54
	17					67
42	17			17		3487	use aliased 'Bio::MUST::Core::Tree';
	17					42
	17					74
43	17			17		3376	use aliased 'Bio::MUST::Core::Taxonomy::MooseNCBI';
	17					47
	17					86
44	17			17		2451	use aliased 'Bio::MUST::Core::Taxonomy::Filter';
	17					51
	17					117
45	17			17		2283	use aliased 'Bio::MUST::Core::Taxonomy::Criterion';
	17					43
	17					122
46	17			17		2260	use aliased 'Bio::MUST::Core::Taxonomy::Category';
	17					45
	17					114
47	17			17		2123	use aliased 'Bio::MUST::Core::Taxonomy::Classifier';
	17					50
	17					111
48	17			17		2090	use aliased 'Bio::MUST::Core::Taxonomy::Labeler';
	17					49
	17					84
49	17			17		2139	use aliased 'Bio::MUST::Core::Taxonomy::ColorScheme';
	17					42
	17					118
50
51
52							# public path to NCBI Taxonomy dump directory
53							has 'tax_dir' => (
54							traits => ['DoNotSerialize'],
55							is => 'ro',
56							isa => 'Bio::MUST::Core::Types::Dir',
57							required => 1,
58							coerce => 1,
59							);
60
61
62							# Note: init_arg => undef had to be removed to allow proper serialization
63							# this is needed because MooseX::Storage has never accepted my patches
64							# see https://rt.cpan.org/Public/Bug/Display.html?id=65733
65
66							has '_ncbi_tax' => (
67							is => 'ro',
68							isa => 'Bio::MUST::Core::Taxonomy::MooseNCBI',
69							lazy => 1,
70							builder => '_build_ncbi_tax',
71							handles => qr{get_\w+}xms, # expose Bio::LITE::Taxonomy accessor methods
72							);
73
74							# Note: this is related to (yet different from)
75							has '_gi_mapper' => ( # the nearly homonymous 'gi_mapper' method
76							traits => ['DoNotSerialize'],
77							is => 'ro',
78							isa => 'Bio::LITE::Taxonomy::NCBI::Gi2taxid',
79							lazy => 1,
80							builder => '_build_gi_mapper',
81							handles => {
82							get_taxid_from_gi => 'get_taxid',
83							},
84							);
85
86
87							has '_is_deleted' => (
88							traits => ['Hash'],
89							is => 'ro',
90							isa => 'HashRef[Bool]',
91							lazy => 1,
92							builder => '_build_is_deleted',
93							handles => {
94							'is_deleted' => 'defined',
95							},
96							);
97
98
99							has '_' . $_ . '_for' => (
100							traits => ['Hash'],
101							is => 'ro',
102							isa => 'HashRef[Str]',
103							lazy => 1,
104							builder => '_build_' . $_ . '_for',
105							handles => {
106							'is_' . $_ => 'defined',
107							$_ . '_for' => 'get',
108							},
109							) for qw(merged misleading);
110
111
112							has '_dupes_for' => (
113							traits => ['Hash'],
114							is => 'ro',
115							isa => 'HashRef[HashRef[Str]]',
116							lazy => 1,
117							builder => '_build_dupes_for',
118							handles => {
119							'is_dupe' => 'defined',
120							'dupes_for' => 'get',
121							},
122							);
123
124							# TODO: change this name? to avoid confusion with othe uses of rank_for
125							has '_rank_for' => (
126							traits => ['Hash'],
127							is => 'ro',
128							isa => 'HashRef[Str]',
129							lazy => 1,
130							builder => '_build_rank_for',
131							handles => {
132							'rank_for' => 'get',
133							},
134							);
135
136
137							has '_strain_taxid_for' => (
138							traits => ['Hash'],
139							is => 'ro',
140							isa => 'HashRef[HashRef[Str]]',
141							lazy => 1,
142							builder => '_build_strain_taxid_for',
143							handles => {
144							'get_strain_taxid_for' => 'get', # TODO: rename?
145							},
146							);
147
148
149							has '_matcher' => (
150							traits => ['DoNotSerialize'],
151							is => 'ro',
152							isa => 'Algorithm::NeedlemanWunsch',
153							init_arg => undef,
154							lazy => 1,
155							builder => '_build_matcher',
156							handles => {
157							nw_score => 'align',
158							},
159							);
160
161							## no critic (ProhibitUnusedPrivateSubroutines)
162
163							sub _build_ncbi_tax {
164	0			0			my $self = shift;
165
166							#### in _build_ncbi_tax
167
168	0						local $_ = q{}; # to avoid loosing $_ due to Bio::LITE... constructor
169
170							# open pipes from (possibly) multiple names.dmp and nodes.dmp
171	0						my $names_txid = file($self->tax_dir, 'names*.dmp' );
172	0						my $names_gca = file($self->tax_dir, 'gca*names.dmp');
173	0						my $nodes_txid = file($self->tax_dir, 'nodes*.dmp' );
174	0						my $nodes_gca = file($self->tax_dir, 'gca*nodes.dmp');
175
176							# Note: taxid files are loaded last to get precedence in get_taxid_from_name
177	0						my @names_files = ($names_gca, $names_txid);
178	0						my @nodes_files = ($nodes_txid, $nodes_gca);
179
180							# Note: names.dmp files are reordered on-the-fly so as scientific names
181							# get precedence over other names (e.g., synonyms) in case of duplicates
182							# Note: this should not be useful anymore (due to _dupes_for attribute)
183	0						my $ol = q{'if (index($_, "scientific name") == -1) { print } else { push @sns, $_ } END{ print for @sns }'};
184	0						open my $names_fh, '-\|', qq{perl -nle $ol @names_files 2> /dev/null};
185	0						open my $nodes_fh, '-\|', qq{cat @nodes_files 2> /dev/null};
186
187	0						return MooseNCBI->new(
188							names => $names_fh,
189							nodes => $nodes_fh,
190
191							# allow most NCBI Taxonomy synonyms classes
192							# cut -f4 -d'\|' names.dmp \| sort \| uniq -c
193							# last updated on Aug-18-2021
194							# 1582 acronym
195							# 576856 authority
196							# 228 blast name
197							# 14555 common name
198							# 52040 equivalent name
199							# 484 genbank acronym
200							# 29979 genbank common name
201							# 1096 genbank synonym
202							# 733 in-part
203							# 61755 includes
204							# 2354398 scientific name
205							# 199012 synonym
206							# 167825 type material
207
208							synonyms => [
209							'synonym', 'genbank synonym',
210							'acronym', 'genbank acronym',
211							'anamorph', 'genbank anamorph', 'teleomorph', # now useless
212							'blast name', 'common name', 'genbank common name',
213							'equivalent name', 'includes',
214							# 'authority', 'in-part', 'type material',
215							]
216							);
217							}
218
219							sub _build_gi_mapper {
220	0			0			my $self = shift;
221
222							#### in _build_gi_mapper
223
224	0						local $_ = q{}; # to avoid loosing $_ due to Bio::LITE... constructor
225
226	0						return Bio::LITE::Taxonomy::NCBI::Gi2taxid->new(
227							dict => file($self->tax_dir, 'gi_taxid_nucl_prot.bin'),
228							save_mem => 1, # 0 does not work on my MacBook Air
229							);
230							}
231
232							sub _build_is_deleted {
233	0			0			my $self = shift;
234
235							#### in _build_is_deleted
236
237	0						my %is_deleted;
238
239	0						my $infile = file($self->tax_dir, 'delnodes.dmp');
240	0						open my $in, '<', $infile;
241
242	0						while (my $line = <$in>) {
243	0						my ($taxon_id) = $line =~ m/^(\d+)/xms;
244	0						$is_deleted{$taxon_id} = 1;
245							}
246
247	0						return \%is_deleted;
248							}
249
250							sub _build_merged_for {
251	0			0			my $self = shift;
252
253							#### in _build_merged_for
254
255	0						my %merged_for;
256
257	0						my $infile = file($self->tax_dir, 'merged.dmp');
258	0						open my $in, '<', $infile;
259
260	0						while (my $line = <$in>) {
261	0						chomp $line;
262	0						my ($old_taxid, $new_taxid) = split /\s\\|\s/xms, $line;
263	0						$merged_for{$old_taxid} = $new_taxid;
264							}
265
266	0						return \%merged_for;
267							}
268
269							sub _build_misleading_for {
270	0			0			my $self = shift;
271
272							#### in _build_misleading_for
273
274	0						my %misleading_for;
275
276							# TODO: improve warning suppression
277							# cat: .../taxdump/misleading*.dmp: No such file or directory
278	0						my $misleadings = file($self->tax_dir, 'misleading*.dmp');
279	0						open my $in, '-\|', "cat $misleadings 2> /dev/null";
280
281	0						while (my $line = <$in>) {
282	0						chomp $line;
283	0						my ($taxid, $full_org) = split /\s\\|\s/xms, $line;
284	0						$misleading_for{$full_org} = $taxid;
285							}
286
287	0						return \%misleading_for;
288							}
289
290							sub _build_dupes_for {
291	0			0			my $self = shift;
292
293							#### in _build_dupes_for
294
295	0						my %taxids_for;
296
297	0						my $infile = file($self->tax_dir, 'names.dmp');
298	0						open my $in, '<', $infile;
299
300							# build hash of NCBI names => taxon_id(s) for all taxa
301
302							LINE:
303	0						while (my $line = <$in>) {
304
305							# focus on scientific names
306	0	0					next LINE unless $line =~ m/scientific \s name/xms;
307
308							# Note: old code was tolerant to "minor" duplications:
309							# do not consider as duplicates genus and subgenus that are the same
310							# similarly ignore duplicates involving phylum vs other levels
311							# phyla come after classes and synonyms (and thus should win)
312							# next LINE if $line =~ m/genus>/xms \|\| $line =~ m/<phylum>/xms;
313							# next LINE if $line =~ m/<actinobacteria>/xms; # workaround...
314
315							# extract taxon and track taxon_id
316	0						chomp $line;
317	0						my ($taxon_id, $taxon) = (split /\s\\|\s/xms, $line)[0..1];
318	0						push @{ $taxids_for{$taxon} }, $taxon_id;
	0
319							}
320
321	0						my %dupes_for;
322
323							# create helper Taxonomy object
324	0						my $tax = $self->new( tax_dir => $self->tax_dir );
325
326							TAXON:
327	0						while (my ($taxon, $taxids) = each %taxids_for) {
328
329							# only proceed for duplicate taxa
330	0	0					next TAXON if @{$taxids} < 2;
	0
331
332							# build hash of partial lineage => taxon_id only for duplicate taxa
333	17			17		22998	no warnings 'uninitialized'; # avoid undef due to 2-4-level lineages
	17					51
	17					2170
334							my %taxid_for = map {
335	0						( join q{; }, ( $tax->get_taxonomy($_) )[-5..-1] ) => $_
336	0						} @{$taxids};
	0
337	17			17		155	use warnings;
	17					45
	17					42014
338
339							# warn of taxa that end by the same three last taxa
340							# Note: this looks like a NCBI bug a couple of scientific names
341							carp "[BMC] Note: $taxon cannot be disambiguated."
342	0	0					. ' This should not be an issue.' if keys %taxid_for < @{$taxids};
	0
343
344	0						$dupes_for{$taxon} = \%taxid_for;
345							}
346
347	0						return \%dupes_for;
348							}
349
350							# Note: taken from nodes.dmp as follows (then manually re-ordered)
351							# cut -f3 -d'\|' nodes.dmp \| sort \| uniq
352							# last updated on Aug-18-2021
353							const my @LEVELS => (
354							'skip', # default collapsing level
355							'top', # useful to preserve 'cellular organisms'
356							'superkingdom', 'kingdom', 'subkingdom',
357							'superphylum', 'phylum', 'subphylum',
358							'superclass', 'class', 'subclass', 'infraclass',
359							'cohort', 'subcohort',
360							'superorder', 'order', 'suborder', 'infraorder', 'parvorder',
361							'superfamily', 'family', 'subfamily',
362							'tribe', 'subtribe',
363							'genus', 'subgenus',
364							'section', 'subsection',
365							'series',
366							'species group', 'species subgroup', 'species', 'subspecies',
367
368							'subvariety', 'varietas', # relative order of all these terms is unclear
369							'morph', 'forma', 'forma specialis',
370							'isolate', 'strain',
371							'pathogroup', 'serogroup', 'serotype',
372							'biotype', 'genotype',
373
374							'clade', 'no rank', # both terms can appear anywhere in a lineage
375							);
376
377							sub _build_rank_for {
378							#### in _build_rank_for
379	0			0			my $i = 1;
380	0						return { map { $_ => $i-- } @LEVELS };
	0
381							}
382
383							sub _build_strain_taxid_for {
384	0			0			my $self = shift;
385
386							#### in _build_strain_taxid_for
387
388							# open pipe from (possibly) multiple names.dmp
389							# TODO: try to avoid code duplication with _build_ncbi_tax
390	0						my $names_txid = file($self->tax_dir, 'names*.dmp' );
391	0						my $names_gca = file($self->tax_dir, 'gca*names.dmp');
392
393							# Note: taxid files are loaded last to get precedence over gca numbers
394	0						my @names_files = ($names_gca, $names_txid);
395	0						open my $names_fh, '-\|', "cat @names_files";
396
397	0						my %strain_taxid_for;
398
399							LINE:
400	0						while (my $line = <$names_fh>) {
401	0						chomp $line;
402
403							# fetch taxid and organism name
404	0						my ($taxid, $org) = split /\s\\|\s/xms, $line;
405
406							# extract components from organism name
407	0						my ($genus, $species, $strain) = SeqId->parse_ncbi_name($org);
408
409							# skip entries without strain
410	0	0					next LINE unless $strain;
411
412							# store tax_id for org and strain combo
413	0						my $org_key = _make_hashkey($genus, $species);
414	0						my $strain_key = _clean_strain($strain);
415	0	0	0				$strain_taxid_for{$org_key}{$strain_key} = $taxid # avoid bug with
416							if defined $strain_key and length $strain_key; # NW aligner
417							}
418
419	0						return \%strain_taxid_for;
420							}
421
422							sub _clean_strain {
423	0			0			my $strain = shift;
424
425							# remove unwanted prefices and characters (if any)
426	0						$strain =~ s{\b substr \b}{}xmsgi;
427	0						$strain =~ s{\b strain \b}{}xmsgi;
428	0						$strain =~ s{\b subsp \b}{}xmsgi;
429	0						$strain =~ s{\b str \b}{}xmsgi;
430	0						$strain =~ tr/A-Za-z0-9//cd; # delete non-alphanumeric chars
431
432	0						return lc $strain;
433							}
434
435							sub _make_hashkey {
436	0			0			my $genus = shift;
437	0						my $species = shift;
438
439	0						$species =~ tr/-//d; # clean hyphens in species
440	0						my $org = lc( $genus . q{ } . $species );
441
442	0						return $org;
443							}
444
445
446							const my $MATCHYOU => 3; # cannot use $MATCH as it is reserved
447							const my $MISMATCH => -1;
448							const my $GAP_OPEN => -2;
449							const my $GAP_EXTEND => 0;
450
451							sub _build_matcher {
452	0			0			my $self = shift;
453
454							#### in _build_matcher
455
456	0						local $_ = q{}; # to avoid loosing $_ due to Algorithm::... constructor
457
458							# setup Algorithm::NeedlemanWunsch object
459	0						my $matcher = Algorithm::NeedlemanWunsch->new( \&_matcher_matrix );
460	0						$matcher->gap_open_penalty( $GAP_OPEN );
461	0						$matcher->gap_extend_penalty($GAP_EXTEND);
462
463	0						return $matcher;
464							}
465
466							const my $GAP_CODE => '*';
467							const my $MATCH_CODE => '.';
468							const my $MISMATCH_CODE => ':';
469
470							const my %SCORE_FOR => (
471							$GAP_CODE => $GAP_OPEN,
472							$MATCH_CODE => $MATCHYOU,
473							$MISMATCH_CODE => $MISMATCH,
474							);
475
476							sub _matcher_matrix { ## no critic (RequireArgUnpacking)
477	0			0			return $SCORE_FOR{ _match_code(@_) };
478							}
479
480							sub _match_code { ## no critic (RequireArgUnpacking)
481	0	0		0			return $GAP_CODE unless @_;
482
483							# if one is not \w+ and the other is \w+ you want it to be a gap, not a
484							# mismatch --> higher penalty
485	0	0	0				if ( $_[0] =~ m/ [^A-Za-z0-9] /xms && $_[1] =~ m/ [A-Za-z0-9] /xms
			0
			0
486							or $_[0] =~ m/ [A-Za-z0-9] /xms && $_[1] =~ m/ [^A-Za-z0-9] /xms ) {
487	0						return $GAP_CODE;
488							}
489
490	0	0	0				if ( $_[0] =~ m/ [A-Za-z0-9] /xms \|\| $_[1] =~ m/ [A-Za-z0-9] /xms ) {
491	0	0					return ( lc $_[0] eq lc $_[1] ) ? $MATCH_CODE : $MISMATCH_CODE;
492							}
493
494	0						return $MATCH_CODE;
495							}
496
497							## use critic
498
499
500							# additional accessor methods (w.r.t. Bio::LITE::Taxonomy)
501							# Since Bio::LITE::Taxonomy methods return arrays (and not ArrayRefs), these
502							# methods also use arrays as in/out data structures (for consistency).
503							# Not quite... We use wantarray when needed now.
504
505							around qr{ _from_seq_id \z \| _from_legacy_seq_id \z }xms => sub {
506							my $method = shift;
507							my $self = shift;
508							my $seq_id = shift;
509
510							# coerce plain strings to SeqId...
511							# ... but do not touch (stringified) lineages (and SeqIds)
512							match_on_type $seq_id => (
513							'Bio::MUST::Core::Types::Lineage' => sub { },
514							'Str' => sub {
515							$seq_id = SeqId->new( full_id => $seq_id )
516							},
517							=> sub { },
518							);
519
520							return $self->$method($seq_id, @_);
521							};
522
523							around qr{ _from_name \z }xms => sub {
524							my $method = shift;
525							my $self = shift;
526							my $name = shift;
527
528							# avoid issue with undefined names
529							return undef unless $name; ## no critic (ProhibitExplicitReturnUndef)
530
531							if ( $self->is_dupe($name) ) {
532							carp "[BMC] Warning: $name is taxonomically ambiguous;"
533							. ' returning undef!';
534							return undef; ## no critic (ProhibitExplicitReturnUndef)
535							}
536
537							return $self->$method($name);
538							};
539
540							around qw( get_taxonomy get_taxonomy_with_levels get_term_at_level ) => sub {
541							my $method = shift;
542							my $self = shift;
543							my $taxon_id = shift;
544
545							# update taxon_id if merged in current version of NCBI Taxonomy
546							# in contrast, we don't do anything if taxon_id has been deleted
547							if ( defined $taxon_id && $self->is_merged($taxon_id) ) {
548							my $msg = "[BMC] Warning: merged taxid for $taxon_id;";
549							$taxon_id = $self->merged_for($taxon_id);
550							carp "$msg using $taxon_id instead!";
551							}
552
553							return $self->$method($taxon_id, @_);
554							};
555
556
557							sub get_taxid_from_seq_id {
558							my $self = shift;
559							my $seq_id = shift;
560
561							# 0. return undef in case of tagged contaminant sequences
562							# TODO: return taxonomy of contaminant 'tail' if any?
563							return undef ## no critic (ProhibitExplicitReturnUndef)
564							if $seq_id->is_doubtful;
565
566							# 1. first handles valid MUST strain names that look like taxon_ids
567							# Note1: 'valid' ids are the opposite to 'foreign_ids'
568							# Note2: such cases are keyed by their full_org names
569							unless ( $seq_id->is_foreign ) {
570							my $taxon_id = $self->misleading_for( $seq_id->full_org );
571							return $taxon_id if $taxon_id;
572							}
573
574							# 2. default path is to use already parsed taxon_id
575							# this applies to modern MUST SeqIds and taxonomy-aware abbr ids
576							return $seq_id->taxon_id
577							if $seq_id->taxon_id;
578
579							# 3. handles foreign_ids:
580							# ... if a GI number is available then use the GI-to-taxid mapper
581							# ... otherwise use full_id as a literal NCBI Taxonomy name
582							if ( $seq_id->is_foreign ) {
583							return $self->get_taxid_from_gi( $seq_id->gi )
584							if $seq_id->gi;
585							return $self->get_taxid_from_name( $seq_id->full_id )
586							}
587
588							# back to valid MUST ids
589							# idea: use as much information as possible but in case of failure...
590							# ... retry after droppping the most specific piece (greedy behavior)
591
592							# 4. tries to recover MUST legacy strain in NCBI Taxonomy...
593							if ( $seq_id->strain ) {
594							my $taxon_id = $self->get_taxid_from_legacy_seq_id($seq_id);
595							return $taxon_id if $taxon_id;
596							}
597
598							# 5. tries to recover organism binomial in NCBI Taxonomy...
599							unless ( $seq_id->is_genus_only ) {
600							my $taxon_id = $self->get_taxid_from_name( $seq_id->org );
601							return $taxon_id if $taxon_id;
602							}
603
604							# 6. tries to recover organism genus in NCBI Taxonomy
605							# this may fail if genus is ambiguous (see around method modifier)
606							return $self->get_taxid_from_name( $seq_id->genus );
607							}
608
609
610							sub get_taxid_from_legacy_seq_id {
611							my $self = shift;
612							my $seq_id = shift;
613
614							# fetch organism name
615							my $org_key = _make_hashkey( $seq_id->genus, $seq_id->species );
616
617							# fetch and clean organism (query) strain
618							my $query_strain = $seq_id->strain;
619							return undef ## no critic (ProhibitExplicitReturnUndef)
620							unless $query_strain;
621							$query_strain = lc SeqId->clean_strain($query_strain);
622
623							# fetch hash containing 'strain => taxid' pairs
624							# original clean code using native delegation:
625							# my $taxid_for = $self->get_strain_taxid_for($org_key);
626							# new hackish code for performance only:
627							my $taxid_for = $self->_strain_taxid_for->{$org_key};
628							return undef ## no critic (ProhibitExplicitReturnUndef)
629							unless $taxid_for;
630
631							# check if strain exist 'as is'
632							my $taxid = $taxid_for->{$query_strain};
633							return $taxid
634							if $taxid;
635
636							# fetch subject strains from hash in lexical order
637							my @sbjct_strains = sort { $a cmp $b } keys %{$taxid_for};
638
639							# split query and subject strains
640							my @split_query_strain = split //xms, $query_strain;
641							my @split_sbjct_strains = map { [ split //xms ] } @sbjct_strains;
642
643							# get score for each alignment through _matcher attribute
644							my @scores = map {
645							$self->nw_score( \@split_query_strain, $_ )
646							} @split_sbjct_strains;
647
648							# get strain with max_score
649							my $max_score = List::AllUtils::max @scores;
650							my $max_index = firstidx { $_ == $max_score } @scores;
651							my $max_strain = $sbjct_strains[$max_index];
652
653							# set heuristic threshold score based on shortest string len
654							my $threshold = List::AllUtils::min(
655							length $query_strain, length $max_strain
656							) * $MATCHYOU + $GAP_OPEN;
657
658							# return strain taxid if above threshold
659							return undef ## no critic (ProhibitExplicitReturnUndef)
660							if $max_score < $threshold;
661							return $taxid_for->{$max_strain};
662							}
663
664
665							sub get_taxonomy_from_seq_id {
666							my $self = shift;
667							my $seq_id = shift;
668
669							return match_on_type $seq_id => (
670
671							# regular case: $seq_id is a Bio::MUST::Core::SeqId
672							'Bio::MUST::Core::SeqId' => sub {
673							$self->_taxonomy_from_seq_id_(0, $seq_id);
674							},
675
676							# optimizations for 'seq_ids' already being lineages (e.g., LCAs)
677							# examine context for returning plain array or ArrayRef
678
679							# case 1: stringified lineage that must be split on semicolons
680							'Bio::MUST::Core::Types::Lineage' => sub {
681							my @taxonomy = split qr{;\ *}xms, $seq_id; # q{;} or q{; }
682							wantarray ? @taxonomy : \@taxonomy;
683							},
684
685							# case 2: ArrayRef that just must be dereferenced
686							ArrayRef => sub {
687							wantarray ? @{$seq_id} : $seq_id
688							},
689							);
690							}
691
692
693							sub fetch_lineage { ## no critic (RequireArgUnpacking)
694	0			0	1		return shift->get_taxonomy_from_seq_id(@_);
695							}
696
697
698							sub get_taxid_from_taxonomy { ## no critic (RequireArgUnpacking)
699	0			0	1		my $self = shift;
700
701							# fetch taxonomy from args using match_on_type strategy (see above)
702							# Note: this could also work from a seq_id but (this would be inefficient)
703	0						my @taxonomy = $self->get_taxonomy_from_seq_id(@_);
704
705	0						while (@taxonomy) {
706
707							# first try to get taxon_id from last taxon
708	0						my $taxon = $taxonomy[-1];
709	0						my $taxon_id = $self->get_taxid_from_name($taxon);
710	0	0					return $taxon_id if $taxon_id;
711
712							# then try to disambiguate duplicate taxa
713	0						my $dupes_for = $self->dupes_for($taxon);
714	17			17		174	no warnings 'uninitialized'; # avoid undef due to 2-4-level lineages
	17					51
	17					1256
715	0						$taxon_id = $dupes_for->{ join q{; }, @taxonomy[-5..-1] };
716	17			17		141	use warnings;
	17					70
	17					131608
717	0	0					if ($taxon_id) {
718	0						carp "[BMC] Note: managed to disambiguate $taxon based on lineage!";
719	0						return $taxon_id;
720							}
721
722							# finally retry with previous (= higher) taxon
723							# Note: this should never be used for NCBI lineages
724							# but for, e.g., SILVA lineages where (triplet) taxa may be different
725	0						pop @taxonomy;
726	0	0					carp "[BMC] Note: trying to identify $taxon by following lineage..."
727							if @taxonomy;
728							}
729
730	0						return undef; ## no critic (ProhibitExplicitReturnUndef)
731							}
732
733
734							sub get_taxonomy_with_levels_from_seq_id { ## no critic (RequireArgUnpacking)
735							return shift->_taxonomy_from_seq_id_(1, @_);
736							}
737
738
739							sub _taxonomy_from_seq_id_ {
740	0			0			my $self = shift;
741	0						my $levels = shift;
742	0						my $seq_id = shift;
743
744							# try to fetch taxon_id and corresponding taxonomy
745	0						my $taxon_id = $self->get_taxid_from_seq_id($seq_id);
746	0	0					my @taxonomy = $levels ? $self->get_taxonomy_with_levels($taxon_id)
747							: $self->get_taxonomy($taxon_id);
748
749							# workaround for 'return undef' constructs in Bio::LITE::Taxonomy
750							# ... to ensure that no (undef) list is returned by our additional methods
751							# Note: our policy is thus unlike get_taxonomy and get_taxid_from_name
752	0	0					@taxonomy = () unless $taxonomy[0];
753	0	0	0				carp '[BMC] Warning: cannot fetch tax for ' . ( $seq_id->full_id \|\| q{''} )
754							. '; ignoring it!' unless @taxonomy;
755
756							# examine context for returning plain array or ArrayRef
757	0	0					return wantarray ? @taxonomy : \@taxonomy;
758							}
759
760
761							sub get_taxa_from_taxid { ## no critic (RequireArgUnpacking)
762	0			0	1		my $self = shift;
763	0						my $taxon_id = shift;
764
765							# TODO: consider numbered levels as in fetch-tax.pl?
766	0						my @taxa = map { $self->get_term_at_level($taxon_id, $_) } @_;
	0
767	0	0					return wantarray ? @taxa : \@taxa; # specify level through currying
768							}
769
770
771							sub get_nexus_label_from_seq_id {
772							my $self = shift;
773							my $seq_id = shift;
774							my $args = shift // {}; # HashRef (should not be empty...)
775
776							# try to fetch organism and accession (or GI number as a fallback)
777							# TODO: allow for choosing between acc/gi when both are available
778							my @lineage = $self->get_taxonomy_from_seq_id($seq_id);
779							my $org = pop @lineage
780							// $seq_id->full_org( q{ } ); # use space before strain
781							my $acc = $args->{append_acc} ? $seq_id->accession // $seq_id->gi : undef;
782
783							# build label according to available pieces
784							# Note: fallback to full_id for foreign ids (undefined org)
785							my $label = defined $org && defined $acc ? "$org [$acc]"
786							: defined $org ? $org
787							: $seq_id->full_id
788							;
789
790							return SeqId->new(full_id => $label)->nexus_id;
791							}
792
793
794							# clade analysis methods
795
796
797							sub get_common_taxonomy_from_seq_ids { ## no critic (RequireArgUnpacking)
798	0			0	1		my $self = shift;
799	0						my @seq_ids = @_;
800
801							# setup optional threshold (if any) to first argument
802							# Note: this approach is used to preserved backwards compatibility
803							# Note: even if SeqId->full_id is a number the test works as expected
804	0	0					my $threshold = looks_like_number $seq_ids[0] ? shift @seq_ids : 1.0;
805
806							# fetch lineages for all seq ids
807							my @lineages = map {
808	0						scalar $self->get_taxonomy_from_seq_id($_) # note the scalar context
	0
809							} @seq_ids;
810
811							# compute common lineage
812	0						return $self->_common_taxonomy($threshold, @lineages);
813							}
814
815
816							sub compute_lca { ## no critic (RequireArgUnpacking)
817	0			0	1		return shift->get_common_taxonomy_from_seq_ids(@_);
818							}
819
820							sub _common_taxonomy { ## no critic (RequireArgUnpacking)
821	0			0			my $self = shift;
822	0						my $threshold = shift;
823	0						my @lineages = @_;
824
825							# ignore missing lineages
826							# TODO: decide whether missing lineages should be taken into account
827	0	0					@lineages = grep { @{$_} ? $_ : () } @lineages;
	0
	0
828
829	0						my @common_lineage;
830
831							# original version: strict consensus
832							# Note the use of a dynamically-sized multiple ArrayRef iterator
833							# to walk down all lineages in parallel
834							# my $ea = each_arrayref(@lineages);
835							#
836							# TAXON:
837							# while ( my @taxa = $ea->() ) {
838							# no warnings 'uninitialized'; # avoid undef due to longer lineages
839							# last TAXON if uniq(@taxa) > 1;
840							# push @common_lineage, shift @taxa;
841							# }
842
843							# current version: threshold-based majority-rule consensus
844							# algorithm: at each taxonomic rank count all seen taxa
845							# if the most popular taxon is seen > threshold (w.r.t. all lineages)
846							# then continue with the lineages featuring it
847							# otherwise stop at previous taxon
848	0						my $n = @lineages;
849
850							TAXON:
851	0						for (my $i = 0; $n; $i++) {
852	0		0	0			my %count_for = count_by { $_->[$i] // q{} } @lineages;
	0
853	0			0			my $taxon = max_by { $count_for{$_} } keys %count_for;
	0
854	0	0					last TAXON unless $taxon;
855	0						my $taxon_n = $count_for{$taxon};
856	0	0					last TAXON if $taxon_n / $n < $threshold;
857	0						push @common_lineage, $taxon;
858	0		0				@lineages = grep { ( $_->[$i] // q{} ) eq $taxon } @lineages;
	0
859							}
860
861							# examine context for returning plain array or ArrayRef
862	0	0					return wantarray ? @common_lineage : \@common_lineage;
863							}
864
865							# tree annotation methods
866
867
868							sub attach_taxonomies_to_terminals {
869	0			0	1		my $self = shift;
870	0						my $tree = shift;
871
872							# store tip taxonomies in Bio::Phylo::Forest::Node generic attributes
873	0						for my $tip ( @{ $tree->tree->get_terminals } ) {
	0
874
875							# fetch taxonomy (and level list) from tip's seq id
876	0						my @tax = $self->get_taxonomy_with_levels_from_seq_id($tip->get_name);
877
878							# attach them as distinct ArrayRefs
879	0						$tip->set_generic('taxonomy' => [ map { $_->[0] } @tax ] );
	0
880	0						$tip->set_generic('levels' => [ map { $_->[1] } @tax ] );
	0
881
882							# Note: levels are needed for robust clade naming and collapsing.
883							# Indeed, get_level_from_name() can return incorrect level when a name
884							# exists at more than one taxonomic level in NCBI Taxonomy (e.g.,
885							# Bacteria, which is also an insect genus).
886							}
887
888	0						return;
889							}
890
891
892
893							sub attach_taxonomies_to_internals {
894	0			0	1		my $self = shift;
895	0						my $tree = shift;
896
897							# post-order tree traversal
898							$tree->tree->visit_depth_first(
899
900							# infer common taxonomy from direct children of each node
901							-post => sub {
902	0			0			my $node = shift;
903	0	0					return if $node->is_terminal;
904
905							# collect children lineages
906							# ... and track the longest (better to be safe...) level list
907	0						my @lineages;
908							my @max_levels;
909	0						for (my $i = 0; my $child = $node->get_child($i); $i++) {
910	0						push @lineages, $child->get_generic('taxonomy');
911	0						my @levels = @{ $child->get_generic('levels') };
	0
912	0	0					@max_levels = @levels if @levels > @max_levels;
913							}
914
915							# store into current node their common lineage
916							# ... and cut down level list to match this common lineage
917	0						my @common_lineage = $self->_common_taxonomy(1.0, @lineages);
918	0						$node->set_generic('taxonomy' => \@common_lineage);
919	0						$node->set_generic(
920							'levels' => [ @max_levels[0..$#common_lineage] ]
921							);
922
923	0						return;
924							},
925	0						);
926
927	0						return;
928							}
929
930
931
932							sub attach_taxa_to_entities {
933	0			0	1		my $self = shift;
934	0						my $tree = shift;
935	0		0				my $args = shift // {}; # HashRef (should not be empty...)
936
937							# setup minimal taxonomic level(s) to reach
938							# Note 1: only named levels can be selected (see list above)
939							# Note 2: thus, 'no rank' results in the lowest possible taxon
940							# Note 3: default values are provided
941	0						my %arg_levels = ( name => 'no rank', collapse => 'skip', %{$args} );
	0
942							my %target_for = map {
943	0						$_ => $self->rank_for( $arg_levels{$_} )
	0
944							} keys %arg_levels;
945
946							# ensure meaningful subtree collapsing WRT naming
947	0	0					if ($target_for{collapse} < $target_for{name}) {
948	0						carp '[BMC] Warning: collapsing level must include naming level;'
949							. ' upgrading!';
950	0						$target_for{collapse} = $target_for{name};
951							}
952
953							# update both terminal and internal nodes
954							# attach lowest taxon with level higher or equal to target level(s)
955
956							NODE:
957	0						for my $node ( @{ $tree->tree->get_entities } ) {
	0
958
959							# reset taxon for robustness
960	0						my $is_named;
961	0						$node->set_generic('taxon' => undef);
962	0						$node->set_generic('taxon_collapse' => undef);
963
964							# walk up node lineage
965	0						my @lineage = @{ $node->get_generic('taxonomy') };
	0
966	0						my @levels = @{ $node->get_generic('levels') };
	0
967	0						while (my $taxon = pop @lineage) {
968
969							# OLD VERSION DERIVING LEVEL FROM TAXON NAME
970							# ... if taxon was the highest in the lineage then rank it at top
971							# Note: this is needed to preserve 'cellular organisms'
972							# my $rank = @lineage == 0 ? 0 # 0 means 'top rank'
973							# : $self->rank_for( $self->get_level_from_name($taxon) );
974
975							# get rank for taxon
976	0						my $rank = $self->rank_for( pop @levels );
977
978							# name subtree at target level or above
979	0	0	0				if (!$is_named && $rank >= $target_for{name}) {
980	0						$node->set_generic('taxon' => $taxon);
981	0						$is_named = 1; # to get the lowest possible taxon
982							} # ... while keeping walking up to collapse at a higher level
983
984							# collapse subtree only at target level
985	0	0					if ($rank == $target_for{collapse}) {
986	0						$node->set_generic('taxon_collapse' => $taxon);
987	0						next NODE; # can stop walking up now!
988							}
989							}
990							}
991
992	0						return;
993							}
994
995
996							# Listable IdList and IdMapper factory methods
997
998							# TODO: decide on whether this kind of check is needed and where...
999
1000							# around qr{ \A taxonomic_ }xms => sub {
1001							# my $method = shift;
1002							# my $self = shift;
1003							# my $listable = shift;
1004							#
1005							# # ensure the object is listable (we could also test for the role)
1006							# unless ( $listable->can('all_seq_ids') ) {
1007							# carp 'Cannot build list/mapper from ' . ref($listable) . '; aborting!';
1008							# return;
1009							# }
1010							#
1011							# return $self->$method($listable, @_);
1012							# };
1013
1014							# TODO: remove support for GI numbers
1015
1016
1017							sub gi_mapper {
1018	0			0	1		my $self = shift;
1019	0						my $listable = shift;
1020
1021							return IdMapper->new(
1022							long_ids => [ $self->_taxids_from_gis($listable) ],
1023	0						abbr_ids => [ map { $_->full_id } $listable->all_seq_ids ],
	0
1024							);
1025							}
1026
1027							const my $BATCH_SIZE => 252;
1028							const my $GOLDEN_RATIO => 1.618;
1029							const my $MAX_ATTEMPT => 3;
1030
1031							sub _taxids_from_gis {
1032	0			0			my $self = shift;
1033	0						my $listable = shift;
1034
1035							# get full_ids and GI numbers
1036	0						my @seq_ids = $listable->all_seq_ids;
1037	0						my @full_ids = map { $_->full_id } @seq_ids;
	0
1038	0						my @gis = map { $_->gi } @seq_ids;
	0
1039	0						### GIs to process: scalar @gis
	0
1040	0						my @uniq_gis = uniq @gis;
1041	0						### Unique GIs to process: scalar @uniq_gis
	0
1042
1043							# try to associate taxon_ids to GIs using binary GI-to-taxid mapper
1044	0						my %taxid_for;
1045							try {
1046	0			0			%taxid_for = map { $_ => $self->get_taxid_from_gi($_) } @uniq_gis;
	0
1047							}
1048							catch {
1049	0			0			### Note: 'Cannot load binary GI-to-taxid mapper'
	0
1050	0						};
1051
1052							# collect missing taxon_ids
1053	0						my @miss_gis = grep { not $taxid_for{$_} } @uniq_gis;
	0
1054
1055	0	0					if (@miss_gis) {
1056	0						### Unique GIs missing from binary GI-to-taxid mapper: scalar @miss_gis
	0
1057	0						### Fetching corresponding taxon_ids online instead...
	0
1058							}
1059
1060	0						my %miss_taxid_for;
1061	0						my $batch_size = $BATCH_SIZE;
1062	0						my $attempt = 1;
1063
1064	0						ESUMMARY:
1065	0						while (@miss_gis) { ### Iterating on missing GIs \|===[%] \|
	0
1066	0						my $total = @miss_gis;
1067	0						$batch_size = List::AllUtils::min($batch_size, $total);
1068							#### $total
1069							#### $batch_size
1070							#### $attempt
1071
1072							BATCH:
1073	0						for (my $i = 0; $i < $total; $i += $batch_size) {
1074	0						my $end = List::AllUtils::min($i + $batch_size, $#miss_gis);
1075							#### $i
1076							#### $end
1077
1078							# fetch eSummaries for current batch of GI numbers
1079	0						my $report = get(
1080							'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?'
1081							. 'db=protein&id=' . join ',', @miss_gis[$i..$end]
1082							);
1083
1084							# stop using current batch size if no answer from NCBI servers
1085	0	0					last BATCH unless $report;
1086
1087							# parse XML report into hash tree
1088	0						my $ob = XML::Bare->new( text => $report );
1089	0						my $root = $ob->parse();
1090
1091							# capture GI number/taxon_id pairs for each eSummary of the report
1092	0						for my $docsum ( @{ $root->{eSummaryResult}{DocSum} } ) {
	0
1093	0						my @fields = @{ $docsum->{Item} };
	0
1094	0			0			my $gi = first { $_->{Name}->{value} eq 'Gi' } @fields;
	0
1095	0			0			my $taxid = first { $_->{Name}->{value} eq 'TaxId' } @fields;
	0
1096	0						$miss_taxid_for{ $gi->{value} } = $taxid->{value};
1097							}
1098							}
1099
1100							# update hash with retrieved taxon_ids
1101	0						%taxid_for = (%taxid_for, %miss_taxid_for);
1102
1103							# collect anew missing taxon_ids
1104	0						@miss_gis = grep { not $taxid_for{$_} } @uniq_gis;
	0
1105
1106							# if some taxon_ids are still missing...
1107	0	0					if (@miss_gis) {
1108
1109							# reduce batch_size after MAX_ATTEMPT attempts
1110	0	0					if (++$attempt > $MAX_ATTEMPT) {
1111	0						$batch_size = floor($batch_size / $GOLDEN_RATIO);
1112	0						$attempt = 1;
1113							}
1114
1115							# stop trying if batch_size falls to zero
1116	0	0					unless ($batch_size) {
1117	0						carp '[BMC] Warning: no answer from NCBI E-utilities;'
1118							. ' dropping GIs!';
1119	0						last ESUMMARY;
1120							}
1121							}
1122							}
1123
1124	0						### Unique GIs retrieved online: scalar keys %miss_taxid_for
	0
1125	0						### Unique GIs still missing: scalar @miss_gis
	0
1126
1127							# build new ids by prepending taxon_ids to full_ids
1128	0						my @taxids_gis;
1129	0						my $ea = each_array(@full_ids, @gis);
1130	0						while ( my ($full_id, $gi) = $ea->() ) {
1131	0						push @taxids_gis, $taxid_for{$gi} . '\|' . $full_id;
1132							}
1133
1134	0						return @taxids_gis;
1135							}
1136
1137
1138
1139							sub tab_mapper {
1140	0			0	1		my $self = shift;
1141	0						my $infile = shift;
1142	0		0				my $args = shift // {};
1143
1144	0		0				my $col = $args->{column} // 1;
1145	0		0				my $sep = $args->{separator} // qr{\t}xms;
1146	0						my $idm = $args->{gi2taxid};
1147
1148	0						open my $in, '<', $infile;
1149
1150	0						tie my %family_for, 'Tie::IxHash';
1151
1152							LINE:
1153	0						while ( my $line = <$in>) {
1154	0						chomp $line;
1155
1156							# skip empty lines and comment lines
1157	0	0	0				next LINE if $line =~ $EMPTY_LINE
1158							\|\| $line =~ $COMMENT_LINE;
1159
1160	0						my @fields = split $sep, $line;
1161	0						$family_for{ $fields[0] } = $fields[$col];
1162							}
1163
1164	0						my @abbr_ids = keys %family_for;
1165	0						my @must_ids = @abbr_ids;
1166
1167	0	0					if ($idm) {
1168
1169							# build gi2taxid 'mapper'
1170	0						my $gi_mapper = IdMapper->load($idm);
1171	0						my @gis = map { $_->gi } $gi_mapper->all_abbr_seq_ids;
	0
1172	0						my @taxon_ids = map { $_->taxon_id } $gi_mapper->all_long_seq_ids;
	0
1173	0						my %taxid_for = mesh @gis, @taxon_ids;
1174
1175							# build modern seq_ids from GIs using 'mapper'
1176	0						for my $id (@must_ids) {
1177	0						my $taxon_id = $taxid_for{ SeqId->new( full_id => $id )->gi };
1178	0						my @taxonomy = $self->get_taxonomy($taxon_id);
1179	0						my $org = $taxonomy[-1];
1180	0						$id = SeqId->new_with(
1181							org => $org,
1182							taxon_id => $taxon_id,
1183							gi => $id, # actually not a 'pure' GI here
1184							)->full_id;
1185							}
1186							}
1187
1188	0						my @long_ids;
1189
1190	0						my $ea = each_array @must_ids, @abbr_ids;
1191	0						while ( my ($must_id, $abbr_id) = $ea->() ) {
1192	0		0				( my $family = $family_for{$abbr_id} // q{} ) =~ tr/- @_/./;
1193	0	0					$family .= '-' if $family;
1194	0						push @long_ids, $family . $must_id;
1195							}
1196
1197	0						return IdMapper->new(
1198							long_ids => \@long_ids,
1199							abbr_ids => \@abbr_ids,
1200							);
1201							}
1202
1203
1204
1205							sub tax_mapper { ## no critic (RequireArgUnpacking)
1206	0			0	1		my $self = shift;
1207	0						my $listable = shift;
1208
1209							return IdMapper->new(
1210							long_ids => [ map {
1211	0						$self->get_nexus_label_from_seq_id($_, @_)
1212							} $listable->all_seq_ids ],
1213	0						abbr_ids => [ map { $_->full_id } $listable->all_seq_ids ],
	0
1214							);
1215							}
1216
1217
1218
1219							sub tax_filter {
1220	0			0	1		my $self = shift;
1221	0						my $list = shift;
1222
1223	0						return Filter->new( tax => $self, _specs => $list );
1224							}
1225
1226
1227
1228							sub tax_criterion {
1229	0			0	1		my $self = shift;
1230	0						my $args = shift;
1231
1232	0						$args->{tax_filter} = $self->tax_filter( $args->{tax_filter} );
1233
1234	0						return Criterion->new($args);
1235							}
1236
1237
1238
1239							sub tax_category {
1240	0			0	1		my $self = shift;
1241	0						my $args = shift;
1242
1243							$args->{criteria} = [
1244	0						map { $self->tax_criterion($_) } @{ $args->{criteria} }
	0
	0
1245							];
1246
1247	0						return Category->new($args);
1248							}
1249
1250
1251							# Classifier/Labeler/ColorScheme factory methods
1252
1253
1254							sub tax_classifier {
1255	0			0	1		my $self = shift;
1256	0						my $args = shift;
1257
1258							$args->{categories} = [
1259	0						map { $self->tax_category($_) } @{ $args->{categories} }
	0
	0
1260							];
1261
1262	0						return Classifier->new($args);
1263							}
1264
1265							# example of input HashRef for tax_classifier
1266							# 'min', 'max' and 'description' keys are both optional
1267							# categories => [
1268							# {
1269							# criteria => [
1270							# {
1271							# max => undef,
1272							# min => 1,
1273							# tax_filter => [
1274							# '+Latimeria'
1275							# ]
1276							# },
1277							# {
1278							# tax_filter => [
1279							# '+Protopterus'
1280							# ]
1281							# },
1282							# {
1283							# tax_filter => [
1284							# '+Danio',
1285							# '+Oreochromis'
1286							# ]
1287							# },
1288							# {
1289							# tax_filter => [
1290							# '+Xenopus'
1291							# ]
1292							# },
1293							# {
1294							# tax_filter => [
1295							# '+Anolis',
1296							# '+Gallus',
1297							# '+Meleagris',
1298							# '+Taeniopygia'
1299							# ]
1300							# },
1301							# {
1302							# tax_filter => [
1303							# '+Mammalia'
1304							# ]
1305							# }
1306							# ],
1307							# description => 'strict species sampling',
1308							# label => 'strict'
1309							# },
1310							# {
1311							# criteria => [
1312							# {
1313							# tax_filter => [
1314							# '+Latimeria'
1315							# ]
1316							# },
1317							# {
1318							# tax_filter => [
1319							# '+Protopterus'
1320							# ]
1321							# },
1322							# {
1323							# tax_filter => [
1324							# '+Danio',
1325							# '+Oreochromis'
1326							# ]
1327							# },
1328							# {
1329							# tax_filter => [
1330							# '+Amphibia',
1331							# '+Amniota'
1332							# ]
1333							# }
1334							# ],
1335							# description => 'loose species sampling',
1336							# label => 'loose'
1337							# }
1338							# ]
1339
1340
1341
1342							sub tax_labeler_from_systematic_frame {
1343	0			0	1		my $self = shift;
1344	0						my $infile = shift;
1345
1346							# Thursday 26 November 2015 at 15 hours 21
1347							# ((((((Crenarchaeota:371:88:-1,Korarchaeota:371:72:-1)a:15:80:-1,...)Tree of Life:3:16:-1;
1348
1349							# ensure that we get a parseable tree from the .fra file
1350							# by considering only line 2 and turning all funny 'branch lengths' to 1
1351	0						my @lines = file($infile)->slurp;
1352	0						(my $newick_str = pop @lines) =~ s/(?: :-?(\d+) ){3}/:1/xmsg;
1353
1354							# parse tree using Bio::Phylo
1355							# Note: keep whitespace because tip labels are not between quotes
1356	0						my $tree = parse(
1357							-format => 'newick',
1358							-string => $newick_str,
1359							-keep_whitespace => 1,
1360							)->first;
1361
1362							# extract tip labels
1363	0						my @labels = map { $_->get_name } @{ $tree->get_terminals };
	0
	0
1364
1365							# build classifier from labels
1366	0						return $self->tax_labeler_from_list( \@labels );
1367							}
1368
1369
1370
1371							sub tax_labeler_from_list {
1372	0			0	1		my $self = shift;
1373	0						my $list = shift;
1374
1375	0						return Labeler->new( tax => $self, labels => $list );
1376							}
1377
1378
1379
1380							sub load_color_scheme { ## no critic (RequireArgUnpacking)
1381	0			0	1		my $self = shift;
1382	0						my $scheme = ColorScheme->new( tax => $self );
1383	0						return $scheme->load(@_);
1384							}
1385
1386
1387
1388							sub eq_tax { ## no critic (RequireArgUnpacking)
1389	0			0	1		my $self = shift;
1390	0						my $got = shift;
1391	0						my $expect = shift;
1392	0						my $classifier = shift;
1393
1394							# classify got and expect orgs
1395	0						my $got_taxon = $classifier->classify($got, @_);
1396	0						my $exp_taxon = $classifier->classify($expect, @_);
1397
1398							# use context to decide what to return
1399							# list context: return taxon labels
1400	0	0					return ($got_taxon, $exp_taxon)
1401							if wantarray;
1402
1403							# scalar context: compare taxon labels if both are defined
1404							return undef ## no critic (ProhibitExplicitReturnUndef)
1405	0	0	0				unless $got_taxon && $exp_taxon;
1406
1407	0						return $got_taxon eq $exp_taxon;
1408							}
1409
1410
1411							# I/O METHODS
1412
1413							const my $CACHEDB => 'cachedb.bin';
1414
1415
1416							sub new_from_cache { ## no critic (RequireArgUnpacking)
1417	0			0	1		my $class = shift;
1418	0						my %args = @_; # TODO: handle HashRef?
1419
1420	0						### Loading NCBI Taxonomy from binary cache file...
	0
1421	0						my $tax_dir = dir( glob $args{tax_dir} );
1422	0						my $cachefile = file($tax_dir, $CACHEDB);
1423	0						my $tax = $class->load($cachefile, inject => { tax_dir => $tax_dir } );
1424
1425	0						### Done!
	0
1426	0						return $tax;
1427							}
1428
1429
1430							sub update_cache {
1431	0			0	1		my $self = shift;
1432
1433	0						my $cachefile = file($self->tax_dir, $CACHEDB);
1434	0						### Updating binary cache file: $cachefile->stringify
	0
1435	0						$self->store($cachefile);
1436
1437	0						### Done!
	0
1438	0						return 1;
1439							}
1440
1441
1442							# class method to setup local taxonomy database
1443
1444
1445							sub setup_taxdir {
1446	0			0	1		my $class = shift;
1447	0						my $tax_dir = shift;
1448	0		0				my $args = shift // {}; # HashRef (should not be empty...)
1449
1450	0						my $source = $args->{source};
1451
1452	0	0					$class->_setup_ncbi_taxdir($tax_dir, $args)
1453							if $source eq 'ncbi';
1454
1455	0	0					$class->_setup_gtdb_taxdir($tax_dir)
1456							if $source eq 'gtdb';
1457
1458	0						return;
1459							}
1460
1461
1462							sub _setup_ncbi_taxdir {
1463	0			0			my $class = shift;
1464	0						my $tax_dir = shift;
1465	0		0				my $args = shift // {}; # HashRef (should not be empty...)
1466
1467	0		0				my $gi_mapper = $args->{gi_mapper} // 0;
1468
1469							# setup local directory
1470	0						$tax_dir = dir( glob $tax_dir );
1471	0						$tax_dir->mkpath();
1472
1473	0						### Installing NCBI Taxonomy database to: $tax_dir->stringify
	0
1474	0						### Please be patient...
	0
1475
1476							# setup remote archive access
1477	0						my $base = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy';
1478	0	0					my @targets = (
1479							'taxdump.tar.gz',
1480							$gi_mapper ? qw(gi_taxid_nucl.dmp.gz gi_taxid_prot.dmp.gz) : ()
1481							);
1482
1483							# download and install file(s)...
1484							# ... first, the taxon_id version...
1485	0						my @dmpfiles;
1486	0						for my $target (@targets) {
1487	0						my $url = "$base/$target";
1488
1489	0						### Downloading: $url
	0
1490	0						my $zipfile = file($tax_dir, $target)->stringify;
1491							# Note: stringify is required by getstore
1492	0						my $ret_code = getstore($url, $zipfile);
1493	0	0					croak "[BMC] Error: cannot download $url: error $ret_code; aborting!"
1494							unless $ret_code == 200;
1495
1496							# TODO: use modules for unarchiving (not that easy)
1497
1498	0						### Unarchiving: $zipfile
	0
1499	0	0					if ($target =~ m/\A gi_taxid/xms) { # GI mappers
1500	0						system("gzip -d $zipfile");
1501	0						push @dmpfiles, change_suffix( $zipfile, q{} );
1502							}
1503							else { # main tax archive
1504	0						system("tar -xzf $zipfile -C $tax_dir");
1505	0						file($zipfile)->remove;
1506							}
1507							}
1508
1509							# delete gca files if any
1510	0						my $gcanamefile = file($tax_dir, 'gca0-names.dmp');
1511	0						my $gcanodefile = file($tax_dir, 'gca0-nodes.dmp');
1512	0	0					$gcanamefile->remove if -e $gcanamefile;
1513	0	0					$gcanodefile->remove if -e $gcanodefile;
1514
1515							#... second, the accession_id version
1516	0						$class->_make_gca_files($tax_dir);
1517
1518							# return true on success (only check main files)
1519	0	0	0				if ( -r $gcanamefile && -r $gcanodefile ) {
1520	0						### Successfully wrote GCA-based files!
	0
1521							}
1522
1523							# delete cache if any
1524	0						my $cachefile = file($tax_dir, $CACHEDB);
1525	0	0					$cachefile->remove if -e $cachefile;
1526
1527							# optionally build binary GI mapper from GI-to-taxid flat files
1528	0	0					if ($gi_mapper) {
1529	0						my $mrgfile = file($tax_dir, 'gi_taxid_nucl_prot.dmp')->stringify;
1530	0						### Merging GI-to-taxid flat files to: $mrgfile
	0
1531	0						system("sort -nm @dmpfiles > $mrgfile");
1532	0						my $binfile = change_suffix($mrgfile, '.bin');
1533	0						### Building binary GI-to-taxid mapper: $binfile
	0
1534	0						new_dict( in => $mrgfile, out => $binfile );
1535	0						file($_)->remove for @dmpfiles, $mrgfile;
1536							}
1537
1538							# return true on success (only check main files)
1539	0	0	0				if ( -r file($tax_dir, 'names.dmp') && -r file($tax_dir, 'nodes.dmp') ) {
1540	0						### Successfully wrote taxid files!
	0
1541	0						return 1;
1542							}
1543
1544	0						### Failed installation!
	0
1545	0						return 0;
1546							}
1547
1548							sub _make_gca_files {
1549	0			0			my $class = shift;
1550	0						my $tax_dir = shift;
1551
1552	0						const my $FS => qq{\t\|\t};
1553	0						const my $FS_REGEX => qr{\t \\| \t}xms;
1554
1555	0						my %rank_for;
1556							my %parent_taxid_for;
1557
1558							# open nodes.dmp file
1559	0						my $nodes = file($tax_dir, 'nodes.dmp');
1560	0						open my $in, '<', $nodes;
1561
1562							# get taxon_id and parent_taxon_id from nodes.dmp file
1563	0						while (my $line = <$in>) {
1564	0						chomp $line;
1565	0						my ($taxon_id, $parent_taxon_id, $rank) = $line
1566							=~ m/^ (\d+) $FS_REGEX (\d+) $FS_REGEX ([^\t]+) $FS_REGEX/xmsg;
1567	0						$rank_for{$taxon_id} = $rank;
1568	0						$parent_taxid_for{$taxon_id} = $parent_taxon_id;
1569							}
1570
1571							# create helper Taxonomy object
1572	0						my $tax = $class->new( tax_dir => $tax_dir );
1573
1574							# define remote assembly reports filenames
1575							# Note: the order is significant (last files dominate over first files)
1576	0						my @targets_acc = qw(
1577							assembly_summary_genbank_historical.txt
1578							assembly_summary_refseq_historical.txt
1579							assembly_summary_genbank.txt
1580							assembly_summary_refseq.txt
1581							);
1582
1583	0						my %name_for;
1584							my %node_for;
1585	0						my %fix_for;
1586
1587	0						my $base_acc = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS';
1588
1589	0						for my $target (@targets_acc) {
1590							#### $target
1591
1592	0						my $url = "$base_acc/$target";
1593	0						my $file = file($tax_dir, $target)->stringify;
1594							#### $file
1595
1596	0						### Downloading: $url
	0
1597	0						my $ret_code = getstore($url, $file);
1598	0	0					croak "[BMC] Error: cannot download $url: error $ret_code; aborting!"
1599							unless $ret_code == 200;
1600
1601							# parse file for accession numbers and related taxon_ids
1602	0						open my $fh, '<', $file;
1603
1604							LINE:
1605	0						while (my $line = <$fh>) {
1606	0						chomp $line;
1607
1608							# skip empty lines and comment lines
1609	0	0	0				next LINE if $line =~ $EMPTY_LINE
1610							\|\| $line =~ $COMMENT_LINE;
1611
1612	0						my ($accession, $taxon_id, $species_taxon_id)
1613							= (split /\t/xms, $line)[0,5,6];
1614
1615							# update merged taxon_id (mostly from historical assembly files)
1616	0	0					$taxon_id = $tax->merged_for($taxon_id)
1617							if $tax->is_merged($taxon_id);
1618	0	0					$species_taxon_id = $tax->merged_for($species_taxon_id)
1619							if $tax->is_merged($species_taxon_id);
1620
1621							# skip deleted nodes (again mostly from historical assembly files)
1622	0	0					next LINE if $tax->is_deleted($taxon_id);
1623
1624							# fetch taxonomy and org using taxon_id
1625	0						my @taxonomy = $tax->get_taxonomy($taxon_id);
1626	0						my $org = $taxonomy[-1];
1627
1628							# use parent taxon_id if no taxon_id for strain
1629	0	0					if ($species_taxon_id == $taxon_id) {
1630	0						$species_taxon_id = $parent_taxid_for{$taxon_id};
1631	0						$fix_for{$taxon_id}{name_for} = $org;
1632	0						$fix_for{$taxon_id}{node_for} = $species_taxon_id;
1633							}
1634
1635	0						$name_for{$accession} = $org;
1636	0						$node_for{$accession} = $species_taxon_id;
1637							}
1638	0						close $fh;
1639							}
1640
1641							# write names.dmp
1642	0						my $names_gca_file = file($tax_dir, 'gca0-names.dmp');
1643	0						open my $names_out, '>', $names_gca_file;
1644
1645	0						for my $accession ( keys %name_for ) {
1646	0						say {$names_out} join $FS,
1647	0						$accession, $name_for{$accession}, q{}, 'scientific name';
1648							}
1649	0						for my $taxon_id ( keys %fix_for ) {
1650	0						say {$names_out} join $FS,
1651	0						$taxon_id, $fix_for{$taxon_id}{name_for}, q{}, 'scientific name';
1652	0						$rank_for{$fix_for{$taxon_id}{name_for}} = $rank_for{$taxon_id};
1653							}
1654
1655							# write nodes.dmp
1656	0						my $nodes_gca_file = file($tax_dir, 'gca0-nodes.dmp');
1657	0						open my $nodes_out, '>', $nodes_gca_file;
1658
1659	0						for my $accession ( keys %node_for ) {
1660	0						say {$nodes_out} join $FS, $accession, $node_for{$accession},
1661	0		0				$rank_for{$name_for{$accession}} // 'no rank';
1662							}
1663	0						for my $taxon_id ( keys %fix_for ) {
1664	0						say {$nodes_out} join $FS, $taxon_id, $fix_for{$taxon_id}{node_for},
1665	0						$rank_for{$taxon_id};
1666							}
1667
1668	0						return;
1669							}
1670
1671
1672							# http://www.ncbi.nlm.nih.gov/news/11-21-2013-strain-id-changes/
1673							#
1674							# Planned change in bacterial strain-level information management
1675							#
1676							# Please be aware that there is an upcoming change (January 2014) in how NCBI
1677							# manages organism strain information. Due to significant increases in the
1678							# volume of strain-specific sequencing, we are changing our management of
1679							# strain information.
1680							#
1681							# Next generation sequencing has already changed the way microbial genomes are
1682							# being used. The scope of microbial sequencing projects has shifted from a
1683							# single isolate representing an organism to multi-isolate and multi-species
1684							# projects representing microbial communities. Consequently, in the first nine
1685							# months of 2013 the sequences of more than 6000 prokaryotic genomes were
1686							# released by INSDC (DDBJ/ENA/GenBank).
1687							#
1688							# NCBI is introducing several changes in prokaryotic genomes and related
1689							# resources such as Assembly, BioProject, BioSample, and Taxonomy that will
1690							# affect your submissions, data downloads, analysis tools, and parsers.
1691							#
1692							# Taxonomy
1693							#
1694							# Assigning strain-level TaxID will be discontinued in January 2014 because
1695							# curation of strain-level TaxIDs will not remain possible under such growth.
1696							# However, the thousands of existing strain-level TaxIDs will remain, and we
1697							# will continue to add informal strain-specific names for genomes from
1698							# specimens that have not yet been identified to the species level, e.g.
1699							# “Rhizobium sp. CCGE 510” and “Micromonas sp. RCC299”. The strain information
1700							# will continue to be collected and displayed.
1701							#
1702							# BioSample
1703							#
1704							# Submitters of genome sequences will be required to register sample meta-data
1705							# in the BioSample database for each organism that they are sequencing. The
1706							# BioSample submission will include the strain information and other metadata,
1707							# such as culture collection and isolation information, as appropriate. The
1708							# BioSample accession will be a link on the GenBank records, and the GenBank
1709							# records themselves will display the strain in the source information.
1710							#
1711							# BioProject
1712							#
1713							# Submitters of genome sequences are already required to register meta-data
1714							# about the research project in the BioProject database. We no longer require
1715							# a one-to-one relationship between a BioProject accession and a genome.
1716							# Instead, a research effort examining multiple strains of a species or
1717							# multiple species of drug-resistant bacteria, for example, could be
1718							# registered as a single BioProject.
1719							#
1720							# Assembly
1721							#
1722							# Each genome assembly is loaded to the Assembly database and assigned an
1723							# Assembly accession. The Assembly accession is specific for a particular
1724							# genome submission.
1725							#
1726							# What defines a genome?
1727							#
1728							# A BioProject ID or accession cannot be used to define a single genome, since
1729							# many may belong to a multi-isolate or multi-species project. Furthermore, a
1730							# TaxID can no longer reliably define an individual genome since unique TaxIDs
1731							# will not be assigned for individual strains and isolates. The collection of
1732							# DNA sequences of an individual sample (isolate) will be represented by a
1733							# BioSample accession and if raw sequence reads are assembled and submitted to
1734							# GenBank they will get a unique Assembly accession. The Assembly accession is
1735							# specific for a particular genome submission. For example, sequence data
1736							# generated from a single sample (with a BioSample accession) could be
1737							# assembled with two different algorithms and so have two sets of GenBank
1738							# accessions, each with its own Assembly accession.
1739							# BioSample accession and each assembled genome has its own Assembly
1740							# accession. This BioProject includes an isolate of Listeria monocytogenes
1741							# (TaxID 1639, strain R2-502) which was registered as BioSample SAMN02203126,
1742							# and its genome is represented in GenBank records CP006595-CP006596, which
1743							# are tracked as a group in the Assembly database under accession
1744							# GCA_000438585.
1745							#
1746							# FTP files
1747							#
1748							# Genome text reports on the FTP site have been modified to include the
1749							# BioSample and Assembly accessions. These two columns were added at the end
1750							# of the tables to minimize problems for existing parsers. Initially, not all
1751							# assemblies will have a BioSample accession because we are still in the
1752							# process of back-filling BioSamples for genomes.
1753							#
1754							# These changes will occur in January 2014. We will be releasing more
1755							# information as the date approaches.
1756
1757							sub _setup_gtdb_taxdir {
1758	0			0			my $class = shift;
1759	0						my $tax_dir = shift;
1760
1761							# setup local directory
1762	0						$tax_dir = dir( glob $tax_dir );
1763	0						$tax_dir->mkpath();
1764
1765	0						### Installing GTDB Taxonomy database to: $tax_dir->stringify
	0
1766	0						### Please be patient...
	0
1767
1768							# setup remote archive access
1769	0						my $base = 'https://data.gtdb.ecogenomic.org/releases/latest';
1770	0						my @targets = (
1771							'ar122_metadata.tar.gz' , 'bac120_metadata.tar.gz',
1772							'FILE_DESCRIPTIONS', 'METHODS', 'RELEASE_NOTES', 'VERSION'
1773							);
1774
1775	0						for my $target (@targets) {
1776	0						my $url = "$base/$target";
1777
1778	0						### Downloading: $url
	0
1779	0						my $zipfile = file($tax_dir, $target)->stringify;
1780	0						my $ret_code = getstore($url, $zipfile);
1781	0	0					croak "[BMC] Error: cannot download $url: error $ret_code; aborting!"
1782							unless $ret_code == 200;
1783
1784	0	0					if ($target =~ m/metadata/xms) {
1785	0						### Unarchiving: $zipfile
	0
1786	0						system("tar -xzf $zipfile -C $tax_dir");
1787	0						file($zipfile)->remove;
1788							}
1789							}
1790
1791	0						my $arc_file = file($tax_dir, 'ar122_metadata*.tsv');
1792	0						my $bac_file = file($tax_dir, 'bac120_metadata*.tsv');
1793	0						my $new_arcfile = file($tax_dir, 'archaea_metadata.tsv');
1794	0						my $new_bacfile = file($tax_dir, 'bacteria_metadata.tsv');
1795
1796							# change file name to avoid GTDB version in basename
1797	0	0					$new_arcfile->remove if -e $new_arcfile;
1798	0	0					$new_bacfile->remove if -e $new_bacfile;
1799	0						system("mv $arc_file $new_arcfile");
1800	0						system("mv $bac_file $new_bacfile");
1801
1802							# return true on success (only check main files)
1803	0	0	0				if ( -r $new_arcfile && -r $new_bacfile ) {
1804	0						### Successfully downloaded metadata files!
	0
1805							}
1806
1807							else {
1808	0						### Failed installation!
	0
1809	0						return 0;
1810							}
1811
1812							# concatenate metadata TSV files
1813	0						my $prok_file = file($tax_dir, 'prok_metadata.tsv');
1814	0	0					$prok_file->remove if -e $prok_file;
1815	0						system("cat $new_arcfile $new_bacfile > $prok_file");
1816
1817							# create hash from metadata file
1818	0						my $table_for = _read_gtdb_metadata($prok_file);
1819
1820							# hash for rank code
1821							my %rank_for = map {
1822	0	0					$_ eq 'superkingdom' ? 'd__' : substr($_, 0, 1) . '__' => $_
	0
1823							} qw (superkingdom phylum class order family genus species);
1824
1825							# setup taxonomic tree
1826	0						my %tree = (
1827							1 => {
1828							rank => 'no rank',
1829							name => 'root',
1830							uniq_name => q{},
1831							children => {
1832							2 => {
1833							rank => 'no rank',
1834							name => 'cellular organisms',
1835							children => {},
1836							},
1837							},
1838							}
1839							);
1840
1841							# fill up taxonomic tree
1842	0						for my $gca ( keys %{$table_for} ) {
	0
1843
1844							# get GTDB taxonomy
1845	0						my $lineage = $table_for->{$gca}{'gtdb_taxonomy'};
1846	0						my @taxonomy = split q{;}, $lineage;
1847
1848	0						my $tree_ref = $tree{1}{children}{2}{children};
1849
1850							# count duplicate taxon in lineage
1851	0			0			my %count_for = count_by { substr( $_, 3, length()-1 ) } @taxonomy;
	0
1852
1853	0						while (my $gtdb_taxon = shift @taxonomy) {
1854
1855							# get taxon rank
1856	0						my ($rank_code, $taxon) = ($gtdb_taxon) =~ m/^([a-z]__)(.*)/xms;
1857	0						my $rank = $rank_for{$rank_code};
1858
1859							# create taxid for taxon
1860	0						my $taxon_id = $rank_code . lc($taxon =~ tr/A-Za-z0-9//cdr);
1861
1862							# create taxid entry (if not yet existing)
1863	0	0					unless ( $tree_ref->{$taxon_id} ) {
1864	0						$tree_ref->{$taxon_id}{name} = $taxon;
1865	0						$tree_ref->{$taxon_id}{rank} = $rank;
1866							}
1867
1868							# set a unique name (if duplicate taxon)
1869	0	0					my $uniq_name = $count_for{$taxon} > 1
1870							? $taxon . ' <' . lc $rank . '>' : q{};
1871							# ... and add it in hash
1872	0	0					$tree_ref->{$taxon_id}{uniq_name} = $uniq_name
1873							if $uniq_name;
1874
1875							# store GCA\|F
1876	0	0					if ($rank eq 'species') {
1877
1878							# change GCF to GCA and store it
1879	0	0					if ($gca =~ m/^GCF/xms) {
1880	0						(my $new_gca = $gca ) =~ s/GCF/GCA/xms;
1881	0						push @{ $tree_ref->{$taxon_id}{gca} }, $new_gca;
	0
1882							}
1883
1884	0						push @{ $tree_ref->{$taxon_id}{gca} }, $gca;
	0
1885							}
1886
1887							# setup children entry if lineage not yet exhausted
1888	0	0					if (@taxonomy) {
1889	0		0				$tree_ref->{$taxon_id}{children} //= {};
1890							$tree_ref = $tree_ref->{$taxon_id}{children}
1891	0						}
1892							}
1893							}
1894
1895							# taxid files
1896	0						my $name_file = file($tax_dir, 'names.dmp');
1897	0						my $node_file = file($tax_dir, 'nodes.dmp');
1898	0						open my $name_out, '>', $name_file;
1899	0						open my $node_out, '>', $node_file;
1900
1901							# gca files
1902	0						my $gcanamefile = file($tax_dir, 'gca0-names.dmp');
1903	0						my $gcanodefile = file($tax_dir, 'gca0-nodes.dmp');
1904	0						open my $gcaname_out, '>', $gcanamefile;
1905	0						open my $gcanode_out, '>', $gcanodefile;
1906
1907							# create empty additional dmp files
1908	0						file($tax_dir, 'delnodes.dmp')->spew;
1909	0						file($tax_dir, 'merged.dmp' )->spew;
1910
1911							# write regular dmp files (first recursive call)
1912							_write_dmp_files( $name_out, $node_out, $gcaname_out, $gcanode_out,
1913	0						1, 1, $tree{1} );
1914
1915	0	0	0				if ( -r $name_file && -r $node_file ) {
1916	0						### Successfully wrote taxid files!
	0
1917							}
1918
1919	0	0	0				if ( -r $gcanamefile && -r $gcanodefile ) {
1920	0						### Successfully wrote GCA-based files!
	0
1921							}
1922
1923	0						return;
1924							}
1925
1926							sub _read_gtdb_metadata {
1927	0			0			my $infile = shift;
1928
1929	0						open my $in, '<', $infile;
1930
1931	0						my %table_for;
1932							my @keys;
1933
1934							LINE:
1935	0						while (my $line = <$in>) {
1936	0						chomp $line;
1937
1938	0	0					if ($line =~ m/^accession/xms) {
1939	0						(undef, @keys) = split /\t/xms, $line;
1940	0						next LINE;
1941							}
1942
1943	0						my ($gca, @values) = split /\t/xms, $line;
1944	0						$gca =~ s/GB_\|RS_//xms;
1945	0						$table_for{$gca} = { mesh @keys, @values };
1946							}
1947
1948	0						return \%table_for;
1949							}
1950
1951							sub _write_dmp_files { ## no critic (ProhibitManyArgs)
1952	0			0			my ($name_out, $node_out, $gcaname_out, $gcanode_out,
1953							$taxon_id, $parent, $tree_ref) = @_;
1954
1955	0						my $name = $tree_ref->{name };
1956	0						my $rank = $tree_ref->{rank };
1957	0		0				my $uniq_name = $tree_ref->{uniq_name} // q{};
1958	0		0				my $gcas = $tree_ref->{gca } // [];
1959
1960							# write gca files
1961	0	0					if ($gcas) {
1962							_append2dmp($gcaname_out, $gcanode_out,
1963							$_, $parent, $name, $rank, $uniq_name)
1964	0						for sort @{$gcas};
	0
1965							}
1966
1967							# write taxid files
1968	0						_append2dmp($name_out, $node_out,
1969							$taxon_id, $parent, $name, $rank, $uniq_name);
1970
1971	0						my $children_for = $tree_ref->{children};
1972	0	0					return unless $children_for;
1973
1974	0						$parent = $taxon_id;
1975
1976							_write_dmp_files( $name_out, $node_out, $gcaname_out, $gcanode_out,
1977							$_, $parent, $children_for->{$_} )
1978	0						for sort keys %{$children_for};
	0
1979
1980	0						return;
1981							}
1982
1983							sub _append2dmp { ## no critic (ProhibitManyArgs)
1984	0			0			my ($name_out, $node_out,
1985							$taxon_id, $parent, $name, $rank, $uniq_name) = @_;
1986
1987							# write names file
1988	0						say {$name_out} join "\t",
	0
1989							$taxon_id , '\|',
1990							$name , '\|',
1991							$uniq_name, '\|',
1992							'scientific name'
1993							;
1994
1995							# write nodes file
1996	0						say {$node_out} join "\t",
	0
1997							$taxon_id, '\|', $parent, '\|', $rank,
1998							( join "\t", '\|', q{} ) x 10;
1999
2000	0						return;
2001							}
2002
2003	17			17		212	no Moose::Util::TypeConstraints;
	17					55
	17					237
2004
2005							__PACKAGE__->meta->make_immutable;
2006							1;
2007
2008							__END__
2009
2010							=pod
2011
2012							=head1 NAME
2013
2014							Bio::MUST::Core::Taxonomy - NCBI Taxonomy one-stop shop
2015
2016							=head1 VERSION
2017
2018							version 0.212670
2019
2020							=head1 SYNOPSIS
2021
2022							# TODO
2023
2024							=head1 DESCRIPTION
2025
2026							# TODO
2027
2028							=head1 METHODS
2029
2030							=head2 get_taxid_from_seq_id
2031
2032							=head2 get_taxid_from_legacy_seq_id
2033
2034							=head2 get_taxonomy_from_seq_id
2035
2036							=head2 get_taxid_from_taxonomy
2037
2038							=head2 get_taxonomy_with_levels_from_seq_id
2039
2040							=head2 get_taxa_from_taxid
2041
2042							=head2 get_nexus_label_from_seq_id
2043
2044							=head2 get_common_taxonomy_from_seq_ids
2045
2046							=head2 attach_taxonomies_to_terminals
2047
2048							=head2 attach_taxonomies_to_internals
2049
2050							=head2 attach_taxa_to_entities
2051
2052							=head2 gi_mapper
2053
2054							=head2 tab_mapper
2055
2056							=head2 tax_mapper
2057
2058							=head2 tax_filter
2059
2060							=head2 tax_criterion
2061
2062							=head2 tax_category
2063
2064							=head2 tax_classifier
2065
2066							=head2 tax_labeler_from_systematic_frame
2067
2068							=head2 tax_labeler_from_list
2069
2070							=head2 load_color_scheme
2071
2072							=head2 eq_tax
2073
2074							=head2 setup_taxdir
2075
2076							=head1 I/O METHODS
2077
2078							=head2 new_from_cache
2079
2080							=head2 update_cache
2081
2082							=head1 ALIASES
2083
2084							=head2 fetch_lineage
2085
2086							=head2 compute_lca
2087
2088							=head1 AUTHOR
2089
2090							Denis BAURAIN <denis.baurain@uliege.be>
2091
2092							=head1 CONTRIBUTORS
2093
2094							=for stopwords Loic MEUNIER Mick VAN VLIERBERGHE
2095
2096							=over 4
2097
2098							=item *
2099
2100							Loic MEUNIER <loic.meunier@doct.uliege.be>
2101
2102							=item *
2103
2104							Mick VAN VLIERBERGHE <mvanvlierberghe@doct.uliege.be>
2105
2106							=back
2107
2108							=head1 COPYRIGHT AND LICENSE
2109
2110							This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.
2111
2112							This is free software; you can redistribute it and/or modify it under
2113							the same terms as the Perl 5 programming language system itself.
2114
2115							=cut