File Coverage

blib/lib/String/Tagged.pm
Criterion Covered Total %
statement 527 563 93.6
branch 221 256 86.3
condition 77 89 86.5
subroutine 53 54 98.1
pod 34 34 100.0
total 912 996 91.5


line stmt bran cond sub pod time code
1             # You may distribute under the terms of either the GNU General Public License
2             # or the Artistic License (the same terms as Perl itself)
3             #
4             # (C) Paul Evans, 2008-2023 -- leonerd@leonerd.org.uk
5              
6             package String::Tagged 0.22;
7              
8 20     20   4688169 use v5.14;
  20         188  
9 20     20   118 use warnings;
  20         38  
  20         653  
10              
11 20     20   114 use Scalar::Util qw( blessed );
  20         36  
  20         1336  
12              
13             require String::Tagged::Extent;
14              
15             use constant {
16 20         1947 FLAG_ANCHOR_BEFORE => 0x01,
17             FLAG_ANCHOR_AFTER => 0x02,
18             FLAG_ITERATING => 0x04,
19             FLAG_DELETED => 0x08,
20 20     20   134 };
  20         37  
21              
22 20     20   144 use constant DEBUG => 0;
  20         49  
  20         1080  
23              
24             # Since we're providing overloading, we should set fallback by default
25 20     20   126 use overload fallback => 1;
  20         38  
  20         171  
26              
27             =head1 NAME
28              
29             C - string buffers with value tags on extents
30              
31             =head1 SYNOPSIS
32              
33             use String::Tagged;
34              
35             my $st = String::Tagged->new( "An important message" );
36              
37             $st->apply_tag( 3, 9, bold => 1 );
38              
39             $st->iter_substr_nooverlap(
40             sub {
41             my ( $substring, %tags ) = @_;
42              
43             print $tags{bold} ? "$substring"
44             : $substring;
45             }
46             );
47              
48             =head1 DESCRIPTION
49              
50             This module implements an object class, instances of which store a (mutable)
51             string buffer that supports tags. A tag is a name/value pair that applies to
52             some extent of the underlying string.
53              
54             The types of tag names ought to be strings, or at least values that are
55             well-behaved as strings, as the names will often be used as the keys in hashes
56             or applied to the C operator.
57              
58             The types of tag values are not restricted - any scalar will do. This could be
59             a simple integer or string, ARRAY or HASH reference, or even a CODE reference
60             containing an event handler of some kind.
61              
62             Tags may be arbitrarily overlapped. Any given offset within the string has in
63             effect, a set of uniquely named tags. Tags of different names are independent.
64             For tags of the same name, only the latest, shortest tag takes effect.
65              
66             For example, consider a string with three tags represented here:
67              
68             Here is my string with tags
69             [-------------------------] foo => 1
70             [-------] foo => 2
71             [---] bar => 3
72              
73             Every character in this string has a tag named C. The value of this tag
74             is 2 for the words C and C and the space inbetween, and 1
75             elsewhere. Additionally, the words C and C and the space between them
76             also have the tag C with a value 3.
77              
78             Since C does not understand the significance of the tag values
79             it therefore cannot detect if two neighbouring tags really contain the same
80             semantic idea. Consider the following string:
81              
82             A string with words
83             [-------] type => "message"
84             [--------] type => "message"
85              
86             This string contains two tags. C will treat this as two
87             different tag values as far as C is concerned, even
88             though C yields the same value for the C tag at any position
89             in the string. The C method may be used to merge tag extents of
90             tags that should be considered as equal.
91              
92             =head1 NAMING
93              
94             I spent a lot of time considering the name for this module. It seems that a
95             number of people across a number of languages all created similar
96             functionality, though named very differently. For the benefit of
97             keyword-based search tools and similar, here's a list of some other names this
98             sort of object might be known by:
99              
100             =over 4
101              
102             =item *
103              
104             Extents
105              
106             =item *
107              
108             Overlays
109              
110             =item *
111              
112             Attribute or attributed strings
113              
114             =item *
115              
116             Markup
117              
118             =item *
119              
120             Out-of-band data
121              
122             =back
123              
124             =cut
125              
126             *is_string_tagged =
127             # It would be nice if we could #ifdef HAVE_PERL_VERSION(...)
128             ( $] >= 5.034 ) ?
129             do { eval 'use experimental "isa"; sub { $_[0] isa __PACKAGE__ }' // die $@ } :
130 288 100   288   1536 do { sub { blessed $_[0] and $_[0]->isa( __PACKAGE__ ) } };
131              
132             =head1 CONSTRUCTOR
133              
134             =cut
135              
136             =head2 new
137              
138             $st = String::Tagged->new( $str )
139              
140             Returns a new instance of a C object. It will contain no tags.
141             If the optional C<$str> argument is supplied, the string buffer will be
142             initialised from this value.
143              
144             If C<$str> is a C object then it will be cloned, as if calling
145             the C method on it.
146              
147             =cut
148              
149             sub new
150             {
151 137     137 1 25527 my $class = shift;
152 137         341 my ( $str ) = @_;
153              
154 137 100       318 return $class->clone( $str ) if is_string_tagged( $str );
155              
156 117 100       325 $str = "" unless defined $str;
157              
158 117         615 return bless {
159             str => "$str",
160             tags => [],
161             }, $class;
162             }
163              
164             =head2 new_tagged
165              
166             $st = String::Tagged->new_tagged( $str, %tags )
167              
168             Shortcut for creating a new C object with the given tags
169             applied to the entire length. The tags will not be anchored at either end.
170              
171             =cut
172              
173             sub new_tagged
174             {
175 8     8 1 1676 my $class = shift;
176 8         28 my ( $str, %tags ) = @_;
177              
178 8         24 my $self = $class->new( $str );
179              
180 8         25 my $length = $self->length;
181 8         43 $self->apply_tag( 0, $length, $_ => $tags{$_} ) for keys %tags;
182              
183 8         31 return $self;
184             }
185              
186             =head2 clone (class)
187              
188             $new = String::Tagged->clone( $orig, %opts )
189              
190             Returns a new instance of C made by cloning the original,
191             subject to the options provided. The returned instance will be in the
192             requested class, which need not match the class of the original.
193              
194             The following options are recognised:
195              
196             =over 4
197              
198             =item only_tags => ARRAY
199              
200             If present, gives an ARRAY reference containing tag names. Only those tags
201             named here will be copied; others will be ignored.
202              
203             =item except_tags => ARRAY
204              
205             If present, gives an ARRAY reference containing tag names. All tags will be
206             copied except those named here.
207              
208             =item convert_tags => HASH
209              
210             If present, gives a HASH reference containing tag conversion functions. For
211             any tags in the original to be copied whose names appear in the hash, the
212             name and value are passed into the corresponding function, which should return
213             an even-sized key/value list giving a tag, or a list of tags, to apply to the
214             new clone.
215              
216             my @new_tags = $convert_tags->{$orig_name}->( $orig_name, $orig_value )
217             # Where @new_tags is ( $new_name, $new_value, $new_name_2, $new_value_2, ... )
218              
219             As a further convenience, if the value for a given tag name is a plain string
220             instead of a code reference, it gives the new name for the tag, and will be
221             applied with its existing value.
222              
223             If C is being used too, then the source names of any tags to be
224             converted must also be listed there, or they will not be copied.
225              
226             =item start => INT
227              
228             I
229              
230             Start at the given position; defaults to 0.
231              
232             =item end => INT
233              
234             I
235              
236             End after the given position; defaults to end of string. This option overrides
237             C.
238              
239             =item len => INT
240              
241             End after the given length beyond the start position; defaults to end of
242             string. This option only applies if C is not given.
243              
244             =back
245              
246             =head2 clone (instance)
247              
248             $new = $orig->clone( %args )
249              
250             Called as an instance (rather than a class) method, the newly-cloned instance
251             is returned in the same class as the original.
252              
253             =cut
254              
255             sub clone
256             {
257 56 100   56 1 2961 my ( $class, $orig ) = blessed $_[0] ?
258             ( ref $_[0], shift ) :
259             ( shift, shift );
260 56         155 my %opts = @_;
261              
262             my $only = exists $opts{only_tags} ?
263 56 100       145 { map { $_ => 1 } @{ $opts{only_tags} } } :
  1         4  
  1         4  
264             undef;
265              
266             my $except = exists $opts{except_tags} ?
267 56 50       134 { map { $_ => 1 } @{ $opts{except_tags} } } :
  0         0  
  0         0  
268             undef;
269              
270 56         122 my $convert = $opts{convert_tags};
271              
272 56         167 my $origstr = $orig->str;
273              
274 56   100     209 my $start = $opts{start} // 0;
275             my $end = $opts{end} //
276             ( defined $opts{len} ? $start + $opts{len}
277 56 100 100     286 : length $origstr );
278              
279 56         96 my $len = $end - $start;
280              
281 56         201 my $new = $class->new( substr $origstr, $start, $end - $start );
282              
283 56         162 my $tags = $orig->{tags};
284              
285             # We know we're only looking
286 56         148 foreach my $t ( @$tags ) {
287 63         143 my ( $ts, $te, $tn, $tv, $tf ) = @$t;
288              
289 63 100       158 next if $te < $start;
290 58 100       141 last if $ts >= $end;
291              
292 55 100 100     168 next if $only and not $only->{$tn};
293 54 0 33     103 next if $except and $except->{$tn};
294              
295 54         85 my @tags;
296 54 100 66     141 if( $convert and my $c = $convert->{$tn} ) {
297 2 100       8 if( ref $c eq "CODE" ) {
298 1         6 @tags = $c->( $tn, $tv );
299             }
300             else {
301 1         4 @tags = ( $c, $tv );
302             }
303             }
304             else {
305 52         109 @tags = ( $tn, $tv );
306             }
307              
308 54         136 $_ -= $start for $ts, $te;
309              
310 54 100       134 my $tl = $te - ( $ts < 0 ? 0 : $ts );
311              
312 54 100       142 next if $te <= 0;
313 53 100 100     248 $ts = -1 if $ts < 0 or $tf & FLAG_ANCHOR_BEFORE;
314 53 100 100     201 $tl = -1 if $te > $len or $tf & FLAG_ANCHOR_AFTER;
315              
316 53         140 while( @tags ) {
317 53         150 $new->apply_tag( $ts, $tl, shift @tags, shift @tags );
318             }
319             }
320              
321 56         266 return $new;
322             }
323              
324             sub _mkextent
325             {
326 191     191   270 my $self = shift;
327 191         355 my ( $start, $end, $flags ) = @_;
328              
329 191         290 $flags &= (FLAG_ANCHOR_BEFORE|FLAG_ANCHOR_AFTER);
330              
331 191         913 return bless [ $self, $start, $end, $flags ], 'String::Tagged::Extent';
332             }
333              
334             =head2 from_sprintf
335              
336             $str = String::Tagged->from_sprintf( $format, @args )
337              
338             I
339              
340             Returns a new instance of a C object, initialised by
341             formatting the supplied arguments using the supplied format.
342              
343             The C<$format> string is similar to that supported by the core C
344             operator, though a few features such as out-of-order argument indexing and
345             vector formatting are missing. This format string may be a plain perl string,
346             or an instance of C. In the latter case, any tags within it
347             are preserved in the result.
348              
349             In the case of a C<%s> conversion, the value of the argument consumed may
350             itself be a C instance. In this case it will be appended to
351             the returned object, preserving any tags within it.
352              
353             All other conversions are handled individually by the core C
354             operator and appended to the result.
355              
356             =cut
357              
358             sub from_sprintf
359             {
360 12     12 1 1018 my $class = shift;
361 12         28 my ( $format, @args ) = @_;
362              
363             # Clone the format string into the candidate return value, and then
364             # repeatedly replace %... expansions with their required value using
365             # ->set_substr, so that embedded tags in the format will behave sensibly.
366              
367 12 100       28 my $ret = ( is_string_tagged( $format ) ) ?
368             $class->clone( $format ) :
369             $class->new( $format );
370              
371 12         21 my $pos = 0;
372              
373 12         52 while( $pos < length $ret ) {
374 22         41 my $str = "$ret";
375 22         59 pos( $str ) = $pos;
376              
377 22         38 my $replacement;
378              
379 22 100       144 if( $str =~ m/\G[^%]+/gc ) {
    100          
    100          
    50          
    0          
380             # A literal span
381 9         20 $pos = $+[0];
382 9         23 next;
383             }
384             elsif( $str =~ m/\G%%/gc ) {
385             # A literal %% conversion
386 1         2 $replacement = "%";
387             }
388             elsif( $str =~ m/\G%([-]?)(\d+|\*)?(?:\.(\d+|\*))?s/gc ) {
389             # A string
390 10         39 my ( $flags, $width, $precision ) = ( $1, $2, $3 );
391 10 100 100     33 $width = shift @args if defined $width and $width eq "*";
392 10 100 100     35 $precision = shift @args if defined $precision and $precision eq "*";
393 10         17 my $arg = shift @args;
394              
395 10 50       20 defined $arg or do {
396 0         0 warnings::warnif( uninitialized => "Use of ininitialized value in String::Tagged->from_sprintf" );
397 0         0 $arg = "";
398             };
399              
400 10 100       20 if( defined $precision ) {
401 2 50       6 if( is_string_tagged( $arg ) ) {
402 0         0 $arg = $arg->substr( 0, $precision );
403             }
404             else {
405 2         10 $arg = substr $arg, 0, $precision;
406             }
407             }
408              
409 10         16 my $leftalign = $flags =~ m/-/;
410              
411 10 100       22 my $padding = defined $width ? $width - length $arg : 0;
412 10 100       18 $padding = 0 if $padding < 0;
413              
414 10         17 $replacement = "";
415              
416 10 100       26 $replacement .= " " x $padding if !$leftalign;
417              
418 10         18 $replacement .= $arg;
419              
420 10 100       24 $replacement .= " " x $padding if $leftalign;
421             }
422             elsif( $str =~ m/\G%(.*?)([cduoxefgXEGbBpaAiDUOF])/gc ) {
423             # Another conversion format
424 2         8 my ( $template, $flags ) = ( $2, $1 );
425 2         3 my $argc = 1;
426 2         4 $argc += ( () = $flags =~ m/\*/g );
427              
428 2         12 $replacement = sprintf "%$flags$template", @args[0..$argc-1];
429 2         5 splice @args, 0, $argc;
430             }
431             elsif( $str =~ m/\G%(.*?)([a-zA-Z])/gc ) {
432 0         0 warn "Unrecognised sprintf conversion %$2";
433             }
434             else {
435             # must be at EOF now
436 0         0 last;
437             }
438              
439 13         45 my $templatelen = $+[0] - $-[0];
440 13         50 $ret->set_substr( $-[0], $templatelen, $replacement );
441              
442 13         32 $pos += length( $replacement );
443             }
444              
445 12         49 return $ret;
446             }
447              
448             =head2 join
449              
450             $str = String::Tagged->join( $sep, @parts )
451              
452             I
453              
454             Returns a new instance of a C object, formed by concatenating
455             each of the component piece together, joined with the separator string.
456              
457             The result will be much like the core C function, except that it will
458             preserve tags in the resulting string.
459              
460             =cut
461              
462             sub join
463             {
464 1     1 1 2 my $class = shift;
465 1         4 my ( $sep, @parts ) = @_;
466              
467 1 50       2 is_string_tagged( $sep ) or
468             $sep = $class->new( $sep );
469              
470 1         3 my $ret = shift @parts;
471 1         5 $ret .= $sep . $_ for @parts;
472              
473 1         4 return $ret;
474             }
475              
476             =head1 METHODS
477              
478             =cut
479              
480             =head2 str
481              
482             $str = $st->str
483              
484             $str = "$st"
485              
486             Returns the plain string contained within the object.
487              
488             This method is also called for stringification; so the C
489             object can be used in a plain string interpolation such as
490              
491             my $message = String::Tagged->new( "Hello world" );
492             print "My message is $message\n";
493              
494             =cut
495              
496 20     20   32309 use overload '""' => 'str';
  20         61  
  20         160  
497              
498             sub str
499             {
500 227     227 1 20190 my $self = shift;
501 227         792 return $self->{str};
502             }
503              
504             =head2 length
505              
506             $len = $st->length
507              
508             $len = length( $st )
509              
510             Returns the length of the plain string. Because stringification works on this
511             object class, the normal core C function works correctly on it.
512              
513             =cut
514              
515             sub length
516             {
517 412     412 1 598 my $self = shift;
518 412         1129 return CORE::length $self->{str};
519             }
520              
521             =head2 substr
522              
523             $str = $st->substr( $start, $len )
524              
525             Returns a C instance representing a section from within the
526             given string, containing all the same tags at the same conceptual positions.
527              
528             =cut
529              
530             sub substr
531             {
532 27     27 1 54 my $self = shift;
533 27         53 my ( $start, $len ) = @_;
534              
535 27         72 return $self->clone( start => $start, len => $len );
536             }
537              
538             =head2 plain_substr
539              
540             $str = $st->plain_substr( $start, $len )
541              
542             Returns as a plain perl string, the substring at the given position. This will
543             be the same string data as returned by C, only as a plain string
544             without the tags
545              
546             =cut
547              
548             sub plain_substr
549             {
550 23     23 1 42 my $self = shift;
551 23         57 my ( $start, $len ) = @_;
552              
553 23         108 return CORE::substr( $self->{str}, $start, $len );
554             }
555              
556             sub _cmp_tags
557             {
558 154     154   265 my ( $as, $ae ) = @$a;
559 154         240 my ( $bs, $be ) = @$b;
560              
561             # Sort by start first; shortest first
562 154   100     636 return $as <=> $bs ||
563             $ae <=> $be;
564             }
565              
566             sub _assert_sorted
567             {
568 0     0   0 my $self = shift;
569              
570 0         0 my $tags = $self->{tags};
571             # If fewer than 2 tags, must be sorted
572 0 0       0 return if @$tags < 2;
573              
574 0         0 my $prev = $tags->[0];
575              
576 0         0 for( my $i = 1; $i < @$tags; $i++ ) {
577 0         0 my $here = $tags->[$i];
578 0         0 local ( $a, $b ) = ( $prev, $here );
579 0 0       0 if( _cmp_tags() <= 0 ) {
580 0         0 $prev = $here;
581 0         0 next;
582             }
583              
584 0         0 print STDERR "Tag order violation at i=$i\n";
585 0         0 print STDERR "[@{[ $i - 1 ]}] = [ $tags->[$i-1]->[0], $tags->[$i-1]->[1] ]\n";
  0         0  
586 0         0 print STDERR "[@{[ $i ]}] = [ $tags->[$i]->[0], $tags->[$i]->[1] ]\n";
  0         0  
587 0         0 die "Assert failure";
588             }
589             }
590              
591             sub _insert_tag
592             {
593 159     159   227 my $self = shift;
594 159         311 my ( $start, $end, $name, $value, $flags ) = @_;
595              
596 159         250 my $tags = $self->{tags};
597              
598 159         338 my $newtag = [ $start, $end, $name => $value, $flags ];
599              
600             # Specialcase - if there's no tags yet, just push it
601 159 100       368 if( @$tags == 0 ) {
602 81         199 push @$tags, $newtag;
603 81         157 return;
604             }
605              
606 78         160 local $a = $newtag;
607              
608             # Two more special cases - it's quite likely we're either inserting an
609             # 'everywhere' tag, or appending one to the end. Check the endpoints first
610 78         114 local $b;
611              
612 78         120 $b = $tags->[0];
613 78 100       148 if( _cmp_tags() <= 0 ) {
614 14         34 unshift @$tags, $newtag;
615 14         34 return;
616             }
617              
618 64         131 $b = $tags->[-1];
619 64 100       108 if( _cmp_tags() >= 0 ) {
620 62         148 push @$tags, $newtag;
621 62         139 return;
622             }
623              
624 2         5 my $range_start = 0;
625 2         6 my $range_end = $#$tags;
626              
627 2         2 my $inspos;
628              
629 2         8 while( $range_end > $range_start ) {
630 2         10 my $i = int( ( $range_start + $range_end ) / 2 );
631              
632 2         6 $b = $tags->[$i];
633 2         6 my $cmp = _cmp_tags;
634              
635 2 50       8 if( $cmp > 0 ) {
    0          
636 2         15 $range_start = $i + 1;
637             }
638             elsif( $cmp < 0 ) {
639 0         0 $range_end = $i; # open interval
640             }
641             else {
642 0         0 $inspos = $i;
643 0         0 last;
644             }
645              
646 2 50       12 if( $range_start == $range_end ) {
647 2         4 $inspos = $range_start;
648 2         5 last;
649             }
650             }
651              
652 2 50       7 $inspos = $range_end unless defined $inspos;
653              
654 2 50       9 $inspos = 0 if $inspos < 0;
655 2 50       8 $inspos = @$tags if $inspos > @$tags;
656              
657 2         8 splice @$tags, $inspos, 0, $newtag;
658              
659 2         5 $self->_assert_sorted if DEBUG;
660             }
661              
662             =head2 apply_tag
663              
664             $st->apply_tag( $start, $len, $name, $value )
665              
666             Apply the named tag value to the given extent. The tag will start on the
667             character at the C<$start> index, and continue for the next C<$len>
668             characters.
669              
670             If C<$start> is given as -1, the tag will be considered to start "before" the
671             actual string. If C<$len> is given as -1, the tag will be considered to
672             end "after" end of the actual string. These special limits are used by
673             C when deciding whether to move a tag boundary. The start of any
674             tag that starts "before" the string is never moved, even if more text is
675             inserted at the beginning. Similarly, a tag which ends "after" the end of the
676             string, will continue to the end even if more text is appended.
677              
678             This method returns the C<$st> object.
679              
680             $st->apply_tag( $e, $name, $value )
681              
682             Alternatively, an existing L object can be passed as
683             the first argument instead of two integers. The new tag will apply at the
684             given extent.
685              
686             =cut
687              
688             sub apply_tag
689             {
690 156     156 1 7028 my $self = shift;
691 156         253 my ( $start, $end );
692 156         224 my $flags = 0;
693              
694 156 50       425 if( blessed $_[0] ) {
695 0         0 my $e = shift;
696 0         0 $start = $e->start;
697 0         0 $end = $e->end;
698              
699 0 0       0 $flags |= FLAG_ANCHOR_BEFORE if $e->anchor_before;
700 0 0       0 $flags |= FLAG_ANCHOR_AFTER if $e->anchor_after;
701             }
702             else {
703 156         231 $start = shift;
704 156         203 my $len = shift;
705              
706 156         309 my $strlen = $self->length;
707              
708 156 100       358 if( $start < 0 ) {
709 34         50 $start = 0;
710 34         61 $flags |= FLAG_ANCHOR_BEFORE;
711             }
712              
713 156 100       312 if( $len == -1 ) {
714 36         52 $end = $strlen;
715 36         55 $flags |= FLAG_ANCHOR_AFTER;
716             }
717             else {
718 120         171 $end = $start + $len;
719 120 100       260 $end = $strlen if $end > $strlen;
720             }
721             }
722              
723 156         297 my ( $name, $value ) = @_;
724              
725 156         395 $self->_insert_tag( $start, $end, $name, $value, $flags );
726              
727 156         399 return $self;
728             }
729              
730             sub _remove_tag
731             {
732 9     9   14 my $self = shift;
733 9         13 my $keepends = shift;
734 9         12 my ( $start, $end );
735              
736 9 100       26 if( blessed $_[0] ) {
737 5         10 my $e = shift;
738 5         9 $start = $e->start;
739 5         11 $end = $e->end;
740             }
741             else {
742 4         6 $start = shift;
743 4         8 $end = $start + shift;
744             }
745              
746 9         19 my ( $name ) = @_;
747              
748 9 100       20 if( my $t = $self->{iterating} ) {
749 5         22 my ( $ts, $te, $tn ) = @$t;
750 5 50 33     27 if( $start == $ts and $end == $te and $name eq $tn ) {
      33        
751 5         7 $t->[4] |= FLAG_DELETED;
752 5         12 return;
753             }
754             }
755              
756 4         8 my $tags = $self->{tags};
757              
758 4         6 my $have_added = 0;
759              
760             # Can't foreach() because we modify $i
761 4         14 for( my $i = 0; $i < @$tags; $i++ ) {
762 8         11 my ( $ts, $te, $tn, $tv, $tf ) = @{ $tags->[$i] };
  8         18  
763              
764 8 100       21 next if $te <= $start;
765 7 100       14 last if $ts >= $end;
766              
767 4 50       9 next if $tn ne $name;
768              
769 4 100 100     20 if( $keepends and $end < $te ) {
770 2         8 $self->_insert_tag( $end, $te, $tn, $tv, $tf & ~(FLAG_ANCHOR_BEFORE|FLAG_ITERATING) );
771 2         4 $have_added = 1;
772             }
773              
774 4 50       11 if( $tf & FLAG_ITERATING ) {
775 0         0 die "ARGH encountered FLAG_ITERATING while walking the list of tags during ->_remove_tag";
776             }
777              
778 4         8 splice @$tags, $i, 1;
779              
780 4 100 100     18 if( $keepends and $ts < $start ) {
781 1         5 $self->_insert_tag( $ts, $start, $tn, $tv, $tf & ~(FLAG_ANCHOR_AFTER|FLAG_ITERATING) );
782 1         3 $have_added = 1;
783             }
784             else {
785 3         19 $i--;
786             }
787             }
788              
789 4         6 if( DEBUG && $have_added ) {
790             $self->_assert_sorted;
791             }
792              
793 4         27 return $self;
794             }
795              
796             =head2 unapply_tag
797              
798             $st->unapply_tag( $start, $len, $name )
799              
800             Unapply the named tag value from the given extent. If the tag extends beyond
801             this extent, then any partial fragment of the tag will be left in the string.
802              
803             This method returns the C<$st> object.
804              
805             $st->unapply_tag( $e, $name )
806              
807             Alternatively, an existing L object can be passed as
808             the first argument instead of two integers.
809              
810             =cut
811              
812             sub unapply_tag
813             {
814 3     3 1 7 my $self = shift;
815 3         9 return $self->_remove_tag( 1, @_ );
816             }
817              
818             =head2 delete_tag
819              
820             $st->delete_tag( $start, $len, $name )
821              
822             Delete the named tag within the given extent. Entire tags are removed, even if
823             they extend beyond this extent.
824              
825             This method returns the C<$st> object.
826              
827             $st->delete_tag( $e, $name )
828              
829             Alternatively, an existing L object can be passed as
830             the first argument instead of two integers.
831              
832             =cut
833              
834             sub delete_tag
835             {
836 6     6 1 8670 my $self = shift;
837 6         14 return $self->_remove_tag( 0, @_ );
838             }
839              
840             =head2 delete_all_tag
841              
842             $st->delete_all_tag( $name )
843              
844             I
845              
846             Deletes every tag with the given name. This is more efficient than calling
847             C to list the tags then C on each one individually
848             in the case of a simple name match.
849              
850             This method returns the C<$st> object.
851              
852             =cut
853              
854             sub delete_all_tag
855             {
856 1     1 1 8 my $self = shift;
857 1         4 my ( $name ) = @_;
858              
859 1         2 my $tags = $self->{tags};
860              
861 1         8 for( my $i = 0; $i < @$tags; $i++ ) {
862 3         4 my ( $ts, $te, $tn, $tv, $tf ) = @{ $tags->[$i] };
  3         8  
863              
864 3 100       9 next if $tn ne $name;
865              
866 2         5 splice @$tags, $i, 1, ();
867 2         5 $i--;
868             }
869              
870 1         6 return $self;
871             }
872              
873             =head2 merge_tags
874              
875             $st->merge_tags( $eqsub )
876              
877             Merge neighbouring or overlapping tags of the same name and equal values.
878              
879             For each pair of tags of the same name that apply on neighbouring or
880             overlapping extents, the C<$eqsub> callback is called, as
881              
882             $equal = $eqsub->( $name, $value_a, $value_b )
883              
884             If this function returns true then the tags are merged.
885              
886             The equallity test function is free to perform any comparison of the values
887             that may be relevant to the application; for example it may deeply compare
888             referred structures and check for equivalence in some application-defined
889             manner. In this case, the first tag of a pair is retained, the second is
890             deleted. This may be relevant if the tag value is a reference to some object.
891              
892             =cut
893              
894             sub merge_tags
895             {
896 8     8 1 8766 my $self = shift;
897 8         13 my ( $eqsub ) = @_;
898              
899 8         16 my $tags = $self->{tags};
900              
901             # Can't foreach() because we modify @$tags
902 8         23 OUTER: for( my $i = 0; $i < @$tags; $i++ ) {
903 13         24 my ( $ts, $te, $tn, $tv, $tf ) = @{ $tags->[$i] };
  13         31  
904              
905 13         40 for( my $j = $i+1; $j < @$tags; $j++ ) {
906 11         17 my ( $t2s, $t2e, $t2n, $t2v, $t2f ) = @{ $tags->[$j] };
  11         21  
907              
908 11 100       26 last if $t2s > $te;
909 10 50       24 next unless $t2s <= $te;
910 10 100       40 next unless $t2n eq $tn;
911              
912 7 100       18 last unless $eqsub->( $tn, $tv, $t2v );
913              
914             # Need to delete the tag at $j, extend the end of the tag at $i, and
915             # possibly move $i later
916 6         34 splice @$tags, $j, 1, ();
917 6         11 $j--;
918              
919 6         11 $te = $tags->[$i][1] = $t2e;
920              
921 6 100       30 $tags->[$i][4] |= FLAG_ANCHOR_AFTER if $t2f & FLAG_ANCHOR_AFTER;
922              
923 6         13 local $a = $tags->[$i];
924              
925 6 100 100     39 if( local $b = $tags->[$i+1] and _cmp_tags() > 0 ) {
926 1         3 my $newpos = $i+1;
927 1   33     6 while( local $b = $tags->[$newpos ] and _cmp_tags() <= 0 ) {
928 0         0 $newpos++;
929             }
930              
931 1         4 splice @$tags, $newpos, 0, splice @$tags, $i, 1, ();
932              
933 1         3 redo OUTER;
934             }
935             }
936             }
937             }
938              
939             =head2 iter_extents
940              
941             $st->iter_extents( $callback, %opts )
942              
943             Iterate the tags stored in the string. For each tag, the CODE reference in
944             C<$callback> is invoked once, being passed a L object
945             that represents the extent of the tag.
946              
947             $callback->( $extent, $tagname, $tagvalue )
948              
949             Options passed in C<%opts> may include:
950              
951             =over 4
952              
953             =item start => INT
954              
955             Start at the given position; defaults to 0.
956              
957             =item end => INT
958              
959             End after the given position; defaults to end of string. This option overrides
960             C.
961              
962             =item len => INT
963              
964             End after the given length beyond the start position; defaults to end of
965             string. This option only applies if C is not given.
966              
967             =item only => ARRAY
968              
969             Select only the tags named in the given ARRAY reference.
970              
971             =item except => ARRAY
972              
973             Select all the tags except those named in the given ARRAY reference.
974              
975             =back
976              
977             I it is safe to call C from within the
978             callback function to remove the tag currently being iterated on.
979              
980             $str->iter_extents( sub {
981             my ( $e, $n, $v ) = @_;
982             $str->delete_tag( $e, $n ) if $n =~ m/^tmp_/;
983             } );
984              
985             Apart from this scenario, the tags in the string should not otherwise be added
986             or removed while the iteration is occurring.
987              
988             =cut
989              
990             sub iter_extents
991             {
992 51     51 1 131 my $self = shift;
993 51         101 my ( $callback, %opts ) = @_;
994              
995             my $start = exists $opts{start} ? $opts{start} :
996 51 100       163 0;
997              
998             my $end = exists $opts{end} ? $opts{end} :
999             exists $opts{len} ? $start + $opts{len} :
1000 51 50       223 $self->length + 1; # so as to include zerolen at end
    100          
1001              
1002 51 100       139 my $only = exists $opts{only} ? { map { $_ => 1 } @{ $opts{only} } } :
  1         4  
  1         3  
1003             undef;
1004              
1005 51 100       119 my $except = exists $opts{except} ? { map { $_ => 1 } @{ $opts{except} } } :
  1         3  
  1         5  
1006             undef;
1007              
1008 51         113 my $tags = $self->{tags};
1009              
1010 51         190 for ( my $i = 0; $i < @$tags; $i++ ) {
1011 76         146 my $t = $tags->[$i];
1012 76         184 my ( $ts, $te, $tn, $tv, $tf ) = @$t;
1013              
1014 76 100       187 next if $te < $start;
1015 75 100       170 last if $ts >= $end;
1016              
1017 74 100 100     225 next if $only and !$only->{$tn};
1018 72 100 100     160 next if $except and $except->{$tn};
1019              
1020 71         116 $t->[4] |= FLAG_ITERATING;
1021 71         155 local $self->{iterating} = $t;
1022              
1023 71         168 $callback->( $self->_mkextent( $ts, $te, $tf ), $tn, $tv );
1024              
1025 71         4519 $t->[4] &= ~FLAG_ITERATING;
1026              
1027 71 100       355 if( $t->[4] & FLAG_DELETED ) {
1028 5         8 splice @$tags, $i, 1, ();
1029 5         15 $i--;
1030             }
1031             }
1032             }
1033              
1034             =head2 iter_tags
1035              
1036             $st->iter_tags( $callback, %opts )
1037              
1038             Iterate the tags stored in the string. For each tag, the CODE reference in
1039             C<$callback> is invoked once, being passed the start point and length of the
1040             tag.
1041              
1042             $callback->( $start, $length, $tagname, $tagvalue )
1043              
1044             Options passed in C<%opts> are the same as for C.
1045              
1046             =cut
1047              
1048             sub iter_tags
1049             {
1050 25     25 1 12087 my $self = shift;
1051 25         61 my ( $callback, %opts ) = @_;
1052              
1053             $self->iter_extents(
1054             sub {
1055 36     36   76 my ( $e, $tn, $tv ) = @_;
1056 36         98 $callback->( $e->start, $e->length, $tn, $tv );
1057             },
1058 25         133 %opts
1059             );
1060             }
1061              
1062             =head2 iter_extents_nooverlap
1063              
1064             $st->iter_extents_nooverlap( $callback, %opts )
1065              
1066             Iterate non-overlapping extents of tags stored in the string. The CODE
1067             reference in C<$callback> is invoked for each extent in the string where no
1068             tags change. The entire set of tags active in that extent is given to the
1069             callback. Because the extent covers possibly-multiple tags, it will not define
1070             the C and C flags.
1071              
1072             $callback->( $extent, %tags )
1073              
1074             The callback will be invoked over the entire length of the string, including
1075             any extents with no tags applied.
1076              
1077             Options may be passed in C<%opts> to control the range of the string iterated
1078             over, in the same way as the C method.
1079              
1080             If the C or C filters are applied, then only the tags that
1081             survive filtering will be present in the C<%tags> hash. Tags that are excluded
1082             by the filtering will not be present, nor will their bounds be used to split
1083             the string into extents.
1084              
1085             =cut
1086              
1087             sub iter_extents_nooverlap
1088             {
1089 34     34 1 68 my $self = shift;
1090 34         60 my ( $callback, %opts ) = @_;
1091              
1092             my $start = exists $opts{start} ? $opts{start} :
1093 34 100       89 0;
1094              
1095             my $end = exists $opts{end} ? $opts{end} :
1096             exists $opts{len} ? $start + $opts{len} :
1097 34 50       144 $self->length;
    100          
1098              
1099 34 100       85 my $only = exists $opts{only} ? { map { $_ => 1 } @{ $opts{only} } } :
  1         4  
  1         4  
1100             undef;
1101              
1102 34 100       74 my $except = exists $opts{except} ? { map { $_ => 1 } @{ $opts{except} } } :
  1         4  
  1         3  
1103             undef;
1104              
1105 34         63 my $tags = $self->{tags};
1106              
1107 34         50 my @active; # ARRAY of [ $ts, $te, $tn, $tv ]
1108 34         46 my $pos = $start;
1109              
1110 34         80 foreach my $t ( @$tags ) {
1111 76         152 my ( $ts, $te, $tn, $tv ) = @$t;
1112              
1113 76 100       164 next if $te < $start;
1114 74 100       140 last if $ts > $end;
1115              
1116 72 100 100     167 next if $only and !$only->{$tn};
1117 70 100 100     137 next if $except and $except->{$tn};
1118              
1119 69         163 while( $pos < $ts ) {
1120 43         74 my %activetags;
1121             my %tagends;
1122 43         72 my $rangeend = $ts;
1123              
1124 43         93 foreach ( @active ) {
1125 42         86 my ( undef, $e, $n, $v ) = @$_;
1126              
1127 42 100       84 $e < $rangeend and $rangeend = $e;
1128 42 100 66     117 next if $tagends{$n} and $tagends{$n} < $e;
1129              
1130 41         80 $activetags{$n} = $v;
1131 41         95 $tagends{$n} = $e;
1132             }
1133              
1134 43         103 $callback->( $self->_mkextent( $pos, $rangeend, 0 ), %activetags );
1135              
1136 43         440 $pos = $rangeend;
1137 43         87 @active = grep { $_->[1] > $pos } @active;
  42         161  
1138             }
1139              
1140 69         199 push @active, [ $ts, $te, $tn, $tv ];
1141             }
1142              
1143 34         79 while( $pos < $end ) {
1144 53         82 my %activetags;
1145             my %tagends;
1146 53         81 my $rangeend = $end;
1147              
1148 53         89 foreach ( @active ) {
1149 72         128 my ( undef, $e, $n, $v ) = @$_;
1150              
1151 72 100       146 $e < $rangeend and $rangeend = $e;
1152 72 100 100     181 next if $tagends{$n} and $tagends{$n} < $e;
1153              
1154 71         125 $activetags{$n} = $v;
1155 71         128 $tagends{$n} = $e;
1156             }
1157              
1158 53         122 $callback->( $self->_mkextent( $pos, $rangeend, 0 ), %activetags );
1159              
1160 53         1558 $pos = $rangeend;
1161 53         128 @active = grep { $_->[1] > $pos } @active;
  72         273  
1162             }
1163              
1164             # We might have zero-length tags active at the very end of the range
1165 34 100       222 if( my @zerolen = grep { $_->[0] == $pos and $_->[1] == $pos } @active ) {
  3 100       23  
1166 1         4 my %activetags;
1167 1         5 foreach ( @active ) {
1168 1         2 my ( undef, undef, $n, $v ) = @$_;
1169              
1170 1         3 $activetags{$n} = $v;
1171             }
1172              
1173 1         4 $callback->( $self->_mkextent( $pos, $pos, 0 ), %activetags );
1174             }
1175             }
1176              
1177             =head2 iter_tags_nooverlap
1178              
1179             $st->iter_tags_nooverlap( $callback, %opts )
1180              
1181             Iterate extents of the string using C, but passing
1182             the start and length of each extent to the callback instead of the extent
1183             object.
1184              
1185             $callback->( $start, $length, %tags )
1186              
1187             Options may be passed in C<%opts> to control the range of the string iterated
1188             over, in the same way as the C method.
1189              
1190             =cut
1191              
1192             sub iter_tags_nooverlap
1193             {
1194 28     28 1 8789 my $self = shift;
1195 28         64 my ( $callback, %opts ) = @_;
1196              
1197             $self->iter_extents_nooverlap(
1198             sub {
1199 80     80   168 my ( $e, %tags ) = @_;
1200 80         201 $callback->( $e->start, $e->length, %tags );
1201             },
1202 28         180 %opts
1203             );
1204             }
1205              
1206             =head2 iter_substr_nooverlap
1207              
1208             $st->iter_substr_nooverlap( $callback, %opts )
1209              
1210             Iterate extents of the string using C, but passing the
1211             substring of data instead of the extent object.
1212              
1213             $callback->( $substr, %tags )
1214              
1215             Options may be passed in C<%opts> to control the range of the string iterated
1216             over, in the same way as the C method.
1217              
1218             =cut
1219              
1220             sub iter_substr_nooverlap
1221             {
1222 6     6 1 6374 my $self = shift;
1223 6         18 my ( $callback, %opts ) = @_;
1224              
1225             $self->iter_extents_nooverlap(
1226             sub {
1227 17     17   46 my ( $e, %tags ) = @_;
1228 17         52 $callback->( $e->plain_substr, %tags );
1229             },
1230 6         42 %opts,
1231             );
1232             }
1233              
1234             =head2 tagnames
1235              
1236             @names = $st->tagnames
1237              
1238             Returns the set of tag names used in the string, in no particular order.
1239              
1240             =cut
1241              
1242             sub tagnames
1243             {
1244 14     14 1 75 my $self = shift;
1245              
1246 14         87 my $tags = $self->{tags};
1247              
1248 14         26 my %tags;
1249 14         37 foreach my $t ( @$tags ) {
1250 15         38 $tags{$t->[2]}++;
1251             }
1252              
1253 14         110 keys %tags;
1254             }
1255              
1256             =head2 get_tags_at
1257              
1258             $tags = $st->get_tags_at( $pos )
1259              
1260             Returns a HASH reference of all the tag values active at the given position.
1261              
1262             =cut
1263              
1264             sub get_tags_at
1265             {
1266 11     11 1 1934 my $self = shift;
1267 11         25 my ( $pos ) = @_;
1268              
1269 11         21 my $tags = $self->{tags};
1270              
1271 11         19 my %tags;
1272              
1273             # TODO: turn this into a binary search
1274 11         25 foreach my $t ( @$tags ) {
1275 14         39 my ( $ts, $te, $tn, $tv ) = @$t;
1276              
1277 14 100       32 last if $ts > $pos;
1278 11 100       48 next if $te <= $pos;
1279              
1280 10         31 $tags{$tn} = $tv;
1281             }
1282              
1283 11         59 return \%tags;
1284             }
1285              
1286             =head2 get_tag_at
1287              
1288             $value = $st->get_tag_at( $pos, $name )
1289              
1290             Returns the value of the named tag at the given position, or C if the
1291             tag is not applied there.
1292              
1293             =cut
1294              
1295             sub get_tag_at
1296             {
1297 6     6 1 19 my $self = shift;
1298 6         19 my ( $pos, $name ) = @_;
1299              
1300 6         14 my $tags = $self->{tags};
1301              
1302 6         10 my $value;
1303              
1304 6         18 foreach my $t ( @$tags ) {
1305 15         35 my ( $ts, $te, $tn, $tv ) = @$t;
1306              
1307 15 100       37 last if $ts > $pos;
1308 11 100       43 next if $te <= $pos;
1309              
1310 8 100       35 $value = $tv if $tn eq $name;
1311             }
1312              
1313 6         22 return $value;
1314             }
1315              
1316             =head2 get_tag_extent
1317              
1318             $extent = $st->get_tag_extent( $pos, $name )
1319              
1320             If the named tag applies to the given position, returns a
1321             L object to represent the extent of the tag at that
1322             position. If it does not, C is returned. If an extent is returned it
1323             will define the C and C flags if appropriate.
1324              
1325             =cut
1326              
1327             sub get_tag_extent
1328             {
1329 13     13 1 79 my $self = shift;
1330 13         65 my ( $pos, $name ) = @_;
1331              
1332 13         34 my $tags = $self->{tags};
1333              
1334 13         26 my ( $start, $end, $flags );
1335              
1336 13         30 foreach my $t ( @$tags ) {
1337 19         47 my ( $ts, $te, $tn, undef, $tf ) = @$t;
1338              
1339 19 100       45 last if $ts > $pos;
1340 16 100       51 next if $te <= $pos;
1341              
1342 15 100       45 next unless $tn eq $name;
1343              
1344 12         21 $start = $ts;
1345 12         20 $end = $te;
1346 12         25 $flags = $tf;
1347             }
1348              
1349 13 100       31 if( defined $start ) {
1350 12         62 return $self->_mkextent( $start, $end, $flags );
1351             }
1352             else {
1353 1         4 return undef;
1354             }
1355             }
1356              
1357             =head2 get_tag_missing_extent
1358              
1359             $extent = $st->get_tag_missing_extent( $pos, $name )
1360              
1361             If the named tag does not apply at the given position, returns the extent of
1362             the string around that position that does not have the tag. If it does exist,
1363             C is returned. If an extent is returned it will not define the
1364             C and C flags, as these do not make sense for the
1365             range in which a tag is absent.
1366              
1367             =cut
1368              
1369             sub get_tag_missing_extent
1370             {
1371 3     3 1 454 my $self = shift;
1372 3         7 my ( $pos, $name ) = @_;
1373              
1374 3         5 my $tags = $self->{tags};
1375              
1376 3         6 my $start = 0;
1377              
1378 3         6 foreach my $t ( @$tags ) {
1379 6         13 my ( $ts, $te, $tn ) = @$t;
1380              
1381 6 100       15 next unless $tn eq $name;
1382              
1383 3 100 100     15 if( $ts <= $pos and $te > $pos ) {
1384 1         4 return undef;
1385             }
1386              
1387 2 100       6 if( $ts > $pos ) {
1388 1         4 return $self->_mkextent( $start, $ts, 0 );
1389             }
1390              
1391 1         3 $start = $te;
1392             }
1393              
1394 1         3 return $self->_mkextent( $start, $self->length, 0 );
1395             }
1396              
1397             =head2 set_substr
1398              
1399             $st->set_substr( $start, $len, $newstr )
1400              
1401             Modifies a extent of the underlying plain string to that given. The extents of
1402             tags in the string are adjusted to cope with the modified region, and the
1403             adjustment in length.
1404              
1405             Tags entirely before the replaced extent remain unchanged.
1406              
1407             Tags entirely within the replaced extent are deleted.
1408              
1409             Tags entirely after the replaced extent are moved by appropriate amount to
1410             ensure they still apply to the same characters as before.
1411              
1412             Tags that start before and end after the extent remain, and have their lengths
1413             suitably adjusted.
1414              
1415             Tags that span just the start or end of the extent, but not both, are
1416             truncated, so as to remove the part of the tag applied on the modified extent
1417             but preserving that applied outside.
1418              
1419             If C<$newstr> is a C object, then its tags will be applied to
1420             C<$st> as appropriate. Edge-anchored tags in C<$newstr> will not be extended
1421             through C<$st>, though they will apply as edge-anchored if they now sit at the
1422             edge of the new string.
1423              
1424             =cut
1425              
1426             sub set_substr
1427             {
1428 56     56 1 14821 my $self = shift;
1429 56         125 my ( $start, $len, $new ) = @_;
1430              
1431 56         134 my $limit = $self->length;
1432              
1433 56 50       182 $start = $limit if $start > $limit;
1434 56 50       163 $len = ( $limit - $start ) if $len > ( $limit - $start );
1435              
1436 56         208 CORE::substr( $self->{str}, $start, $len ) = $new;
1437              
1438 56         105 my $oldend = $start + $len;
1439 56         107 my $newend = $start + CORE::length( $new );
1440              
1441 56         88 my $delta = $newend - $oldend;
1442             # Positions after $oldend have now moved up $delta places
1443              
1444 56         93 my $tags = $self->{tags};
1445              
1446 56         87 my $i = 0;
1447              
1448 56         159 for( ; $i < @$tags; $i++ ) {
1449             # In this loop we'll handle tags that start before the deleted section
1450              
1451 40         68 my $t = $tags->[$i];
1452 40         132 my ( $ts, $te, undef, undef, $tf ) = @$t;
1453              
1454 40 100 100     202 last if $ts >= $start and not( $tf & FLAG_ANCHOR_BEFORE );
1455              
1456             # Two cases:
1457             # A: Tag spans entirely outside deleted section - stretch/compress it
1458             # We may have to collapse it to nothing, so delete it
1459             # B: Tag starts before but ends within deleted section - truncate it
1460             # Plus a case we don't care about
1461             # Tag starts and ends entirely before the deleted section - ignore it
1462              
1463 28 100 100     219 if( $te > $oldend or
    100 100        
1464             ( $te == $oldend and $tf & FLAG_ANCHOR_AFTER ) ) {
1465             # Case A
1466 14         54 $t->[1] += $delta;
1467              
1468 14 50       65 if( $t->[0] == $t->[1] ) {
1469 0         0 splice @$tags, $i, 1, ();
1470 0         0 $i--;
1471 0         0 next;
1472             }
1473             }
1474             elsif( $te > $start ) {
1475             # Case B
1476 1         3 $t->[1] = $start;
1477             }
1478             }
1479              
1480 56         176 for( ; $i < @$tags; $i++ ) {
1481 13         63 my $t = $tags->[$i];
1482 13         39 my ( $ts, $te ) = @$t;
1483              
1484             # In this loop we'll handle tags that start within the deleted section
1485 13 100       52 last if $ts >= $oldend;
1486              
1487             # Two cases
1488             # C: Tag contained entirely within deleted section - delete it
1489             # D: Tag starts within but ends after the deleted section - truncate it
1490              
1491 3 100       16 if( $te <= $oldend ) {
1492             # Case C
1493 2         7 splice @$tags, $i, 1;
1494 2         3 $i--;
1495 2         7 next;
1496             }
1497             else {
1498             # Case D
1499 1         3 $t->[0] = $newend;
1500 1         3 $t->[1] += $delta;
1501             }
1502             }
1503              
1504 56         139 for( ; $i < @$tags; $i++ ) {
1505 12         25 my $t = $tags->[$i];
1506 12         55 my ( $ts, $te, undef, undef, $tf ) = @$t;
1507              
1508             # In this loop we'll handle tags that start after the deleted section
1509              
1510             # One case
1511             # E: Tag starts and ends after the deleted section - move it
1512 12 100       62 $t->[0] += $delta unless $tf & FLAG_ANCHOR_BEFORE;
1513 12         47 $t->[1] += $delta;
1514              
1515             # If we've not moved the start (because it was FLAG_ANCHOR_BEFORE), we
1516             # might now have an ordering constraint violation. Better fix it.
1517 12         47 local $b = $t;
1518 12         51 foreach my $new_i ( reverse 0 .. $i-1 ) {
1519 7         14 local $a = $tags->[$new_i];
1520              
1521 7 100       27 last if _cmp_tags() <= 0;
1522              
1523 1         6 splice @$tags, $new_i, 0, splice @$tags, $i, 1, ();
1524              
1525 1         3 last;
1526             }
1527             }
1528              
1529 56 100       115 if( is_string_tagged( $new ) ) {
1530 21         67 my $atstart = $start == 0;
1531 21         56 my $atend = $newend == $self->length;
1532              
1533             $new->iter_extents( sub {
1534 21     21   50 my ( $e, $tn, $tv ) = @_;
1535 21 50 66     126 $self->apply_tag(
    100 100        
1536             ( $atstart && $e->anchor_before ) ? -1 : $start + $e->start,
1537             ( $atend && $e->anchor_after ) ? -1 : $e->length,
1538             $tn, $tv );
1539 21         253 } );
1540             }
1541              
1542 56         163 $self->_assert_sorted if DEBUG;
1543              
1544 56         222 return $self;
1545             }
1546              
1547             =head2 insert
1548              
1549             $st->insert( $start, $newstr )
1550              
1551             Insert the given string at the given position. A shortcut around
1552             C.
1553              
1554             If C<$newstr> is a C object, then its tags will be applied to
1555             C<$st> as appropriate. If C<$start> is 0, any before-anchored tags in will
1556             become before-anchored in C<$st>.
1557              
1558             =cut
1559              
1560             sub insert
1561             {
1562 15     15 1 7863 my $self = shift;
1563 15         47 my ( $at, $new ) = @_;
1564 15         78 $self->set_substr( $at, 0, $new );
1565             }
1566              
1567             =head2 append
1568              
1569             $st->append( $newstr )
1570              
1571             $st .= $newstr
1572              
1573             Append to the underlying plain string. A shortcut around C.
1574              
1575             If C<$newstr> is a C object, then its tags will be applied to
1576             C<$st> as appropriate. Any after-anchored tags in will become after-anchored
1577             in C<$st>.
1578              
1579             =cut
1580              
1581 20     20   81524 use overload '.=' => 'append';
  20         51  
  20         139  
1582              
1583             sub append
1584             {
1585 66     66 1 1465 my $self = shift;
1586 66         143 my ( $new ) = @_;
1587              
1588 66 100       131 return $self->set_substr( $self->length, 0, $new ) if is_string_tagged( $new );
1589              
1590             # Optimised version
1591 48         146 $self->{str} .= $new;
1592              
1593 48         119 my $newend = $self->length;
1594              
1595 48         83 my $tags = $self->{tags};
1596              
1597 48         88 my $i = 0;
1598              
1599             # Adjust boundaries of ANCHOR_AFTER tags
1600 48         123 for( ; $i < @$tags; $i++ ) {
1601 58         98 my $t = $tags->[$i];
1602 58 100       177 $t->[1] = $newend if $t->[4] & FLAG_ANCHOR_AFTER;
1603             }
1604              
1605 48         150 return $self;
1606             }
1607              
1608             =head2 append_tagged
1609              
1610             $st->append_tagged( $newstr, %tags )
1611              
1612             Append to the underlying plain string, and apply the given tags to the
1613             newly-inserted extent.
1614              
1615             Returns C<$st> itself so that the method may be easily chained.
1616              
1617             =cut
1618              
1619             sub append_tagged
1620             {
1621 19     19 1 52 my $self = shift;
1622 19         59 my ( $new, %tags ) = @_;
1623              
1624 19         57 my $start = $self->length;
1625 19         79 my $len = CORE::length( $new );
1626              
1627 19         59 $self->append( $new );
1628 19         98 $self->apply_tag( $start, $len, $_, $tags{$_} ) for keys %tags;
1629              
1630 19         96 return $self;
1631             }
1632              
1633             =head2 concat
1634              
1635             $ret = $st->concat( $other )
1636              
1637             $ret = $st . $other
1638              
1639             Returns a new C containing the two strings concatenated
1640             together, preserving any tags present. This method overloads normal string
1641             concatenation operator, so expressions involving C values
1642             retain their tags.
1643              
1644             This method or operator tries to respect subclassing; preferring to return a
1645             new object of a subclass if either argument or operand is a subclass of
1646             C. If they are both subclasses, it will prefer the type of the
1647             invocant or first operand.
1648              
1649             =cut
1650              
1651 20     20   6292 use overload '.' => 'concat';
  20         93  
  20         127  
1652              
1653             sub concat
1654             {
1655 17     17 1 8889 my $self = shift;
1656 17         54 my ( $other, $swap ) = @_;
1657              
1658             # Try to find the "higher" subclass
1659 17 100 100     117 my $class = ( ref $self eq __PACKAGE__ and is_string_tagged( $other ) )
1660             ? ref $other : ref $self;
1661              
1662 17         87 my $ret = $class->new( $self );
1663 17 100       118 return $ret->insert( 0, $other ) if $swap;
1664 6         24 return $ret->append( $other );
1665             }
1666              
1667             =head2 matches
1668              
1669             @subs = $st->matches( $regexp )
1670              
1671             Returns a list of substrings (as C instances) for every
1672             non-overlapping match of the given C<$regexp>.
1673              
1674             This could be used, for example, to build a formatted string from a formatted
1675             template containing variable expansions:
1676              
1677             my $template = ...
1678             my %vars = ...
1679              
1680             my $ret = String::Tagged->new;
1681             foreach my $m ( $template->matches( qr/\$\w+|[^$]+/ ) ) {
1682             if( $m =~ m/^\$(\w+)$/ ) {
1683             $ret->append_tagged( $vars{$1}, %{ $m->get_tags_at( 0 ) } );
1684             }
1685             else {
1686             $ret->append( $m );
1687             }
1688             }
1689              
1690             This iterates segments of the template containing variables expansions
1691             starting with a C<$> symbol, and replaces them with values from the C<%vars>
1692             hash, careful to preserve all the formatting tags from the original template
1693             string.
1694              
1695             =cut
1696              
1697             sub matches
1698             {
1699 1     1 1 9 my $self = shift;
1700 1         3 my ( $re ) = @_;
1701              
1702 1         3 my $plain = $self->str;
1703              
1704 1         2 my @ret;
1705 1         9 while( $plain =~ m/$re/g ) {
1706 5         24 push @ret, $self->substr( $-[0], $+[0] - $-[0] );
1707             }
1708              
1709 1         6 return @ret;
1710             }
1711              
1712             =head2 match_extents
1713              
1714             @extents = $st->match_extents( $regexp )
1715              
1716             I
1717              
1718             Returns a list of extent objects for every non-overlapping match of the given
1719             C<$regexp>. This is similar to L, except that the results are
1720             returned as extent objects instead of substrings, allowing access to the
1721             position information as well.
1722              
1723             If using the result of this method to find regions of a string to modify,
1724             remember that any length alterations will not update positions in later extent
1725             objects. However, since the extents are non-overlapping and in position order,
1726             this can be handled by iterating them in reverse order so that the
1727             modifications done first are later in the string.
1728              
1729             foreach my $e ( reverse $st->match_extents( $pattern ) ) {
1730             $st->set_substr( $e->start, $e->length, $replacement );
1731             }
1732              
1733             =cut
1734              
1735             sub match_extents
1736             {
1737 2     2 1 9287 my $self = shift;
1738 2         5 my ( $re ) = @_;
1739              
1740 2         5 my $plain = $self->str;
1741              
1742 2         4 my @ret;
1743 2         29 while( $plain =~ m/$re/g ) {
1744 9         31 push @ret, $self->_mkextent( $-[0], $+[0], 0 );
1745             }
1746              
1747 2         9 return @ret;
1748             }
1749              
1750             =head2 split
1751              
1752             @parts = $st->split( $regexp, $limit )
1753              
1754             Returns a list of substrings by applying the regexp to the string content;
1755             similar to the core perl C function. If C<$limit> is supplied, the
1756             method will stop at that number of elements, returning the entire remainder of
1757             the input string as the final element. If the C<$regexp> contains a capture
1758             group then the content of the first one will be added to the return list as
1759             well.
1760              
1761             =cut
1762              
1763             sub split
1764             {
1765 4     4 1 34 my $self = shift;
1766 4         8 my ( $re, $limit ) = @_;
1767              
1768 4         7 my $plain = $self->str;
1769              
1770 4         7 my $prev = 0;
1771 4         9 my @ret;
1772 4         43 while( $plain =~ m/$re/g ) {
1773 5         29 push @ret, $self->substr( $prev, $-[0]-$prev );
1774 5 100       28 push @ret, $self->substr( $-[1], $+[1]-$-[1] ) if @- > 1;
1775              
1776 5         13 $prev = $+[0];
1777              
1778 5 100 66     32 last if defined $limit and @ret == $limit-1;
1779             }
1780              
1781 4 100       13 if( CORE::length $plain > $prev ) {
1782 3         8 push @ret, $self->substr( $prev, CORE::length( $plain ) - $prev );
1783             }
1784              
1785 4         17 return @ret;
1786             }
1787              
1788             =head2 sprintf
1789              
1790             $ret = $st->sprintf( @args )
1791              
1792             I
1793              
1794             Returns a new string by using the given instance as the format string for a
1795             L constructor call. The returned instance will be of the same
1796             class as the invocant.
1797              
1798             =cut
1799              
1800             sub sprintf
1801             {
1802 1     1 1 2 my $self = shift;
1803              
1804 1         4 return ( ref $self )->from_sprintf( $self, @_ );
1805             }
1806              
1807             =head2 debug_sprintf
1808              
1809             $ret = $st->debug_sprintf
1810              
1811             Returns a representation of the string data and all the tags, suitable for
1812             debug printing or other similar use. This is a format such as is given in the
1813             DESCRIPTION section above.
1814              
1815             The output will consist of a number of lines, the first containing the plain
1816             underlying string, then one line per tag. The line shows the extent of the tag
1817             given by C<[---]> markers, or a C<|> in the special case of a tag covering
1818             only a single character. Special markings of C> and C> indicate
1819             tags which are "before" or "after" anchored.
1820              
1821             For example:
1822              
1823             Hello, world
1824             [---] word => 1
1825             <[----------]> everywhere => 1
1826             | space => 1
1827              
1828             =cut
1829              
1830             sub debug_sprintf
1831             {
1832 5     5 1 777 my $self = shift;
1833              
1834 5         18 my $str = $self->str;
1835 5         11 my $len = CORE::length( $str );
1836              
1837 5         9 my $maxnamelen = 0;
1838              
1839 5         23 my $ret = " " . ( $str =~ s/\n/./gr ) . "\n";
1840              
1841             $self->iter_tags( sub {
1842 7     7   20 my ( undef, undef, $name, undef ) = @_;
1843 7 100       24 CORE::length( $name ) > $maxnamelen and $maxnamelen = CORE::length( $name );
1844 5         32 } );
1845              
1846 5         29 foreach my $t ( @{ $self->{tags} } ) {
  5         20  
1847 7         30 my ( $ts, $te, $tn, $tv, $tf ) = @$t;
1848              
1849 7 100       26 $ret .= ( $tf & FLAG_ANCHOR_BEFORE ) ? " <" : " ";
1850              
1851 7         17 $ret .= " " x $ts;
1852              
1853 7         10 my $tl = $te - $ts;
1854              
1855 7 100       22 if( $tl == 0 ) {
    100          
1856 1         9 $ret =~ s/ $/>
1857 1         2 $te++; # account for extra printed width
1858             }
1859             elsif( $tl == 1 ) {
1860 2         4 $ret .= "|";
1861             }
1862             else {
1863 4         11 $ret .= "[" . ( "-" x ( $tl - 2 ) ) . "]";
1864             }
1865              
1866 7         14 $ret .= " " x ( $len - $te );
1867              
1868 7 100       16 $ret .= ( $tf & FLAG_ANCHOR_AFTER ) ? "> " : " ";
1869              
1870 7         34 $ret .= CORE::sprintf "%-*s => %s\n", $maxnamelen, $tn, $tv;
1871             }
1872              
1873 5         26 return $ret;
1874             }
1875              
1876             =head1 TODO
1877              
1878             =over 4
1879              
1880             =item *
1881              
1882             There are likely variations on the rules for C that could equally
1883             apply to some uses of tagged strings. Consider whether the behaviour of
1884             modification is chosen per-method, per-tag, or per-string.
1885              
1886             =item *
1887              
1888             Consider how to implement a clone from one tag format to another which wants
1889             to merge multiple different source tags together into a single new one.
1890              
1891             =back
1892              
1893             =head1 AUTHOR
1894              
1895             Paul Evans
1896              
1897             =cut
1898              
1899             0x55AA;