File Coverage

blib/lib/Data/Random/Structure/UTF8.pm
Criterion Covered Total %
statement 97 110 88.1
branch 45 60 75.0
condition 16 18 88.8
subroutine 15 15 100.0
pod 6 6 100.0
total 179 209 85.6


line stmt bran cond sub pod time code
1             package Data::Random::Structure::UTF8;
2              
3 4     4   317351 use 5.8.0;
  4         46  
4 4     4   24 use strict;
  4         9  
  4         81  
5 4     4   31 use warnings;
  4         9  
  4         216  
6              
7             our $VERSION='0.06';
8              
9 4     4   2004 use parent 'Data::Random::Structure';
  4         1213  
  4         22  
10              
11 4     4   70460 use Scalar::Util qw( looks_like_number );
  4         11  
  4         5201  
12              
13             sub new {
14 5     5 1 4504 my $class = shift;
15 5         24 my %options = @_;
16 5         11 my $only_unicode = 0;
17 5 100       22 if( exists $options{'only-unicode'} ){
18 3 50       9 if( defined $options{'only-unicode'} ){
19 3         9 $only_unicode = $options{'only-unicode'}
20             }
21             # do not pass our options to parent it may get confused and croak
22 3         7 delete $options{'only-unicode'}
23             }
24 5         41 my $self = $class->SUPER::new(%options);
25             # at this point our _init() will be called via parent's
26             # constructor. Our _init() will call parent's _init()
27 5         38 $self->only_unicode($only_unicode);
28 5         32 return $self
29             }
30             sub _reset {
31 12     12   22 my $self = shift;
32             # we are interfering with the internals of the parent... not good
33 12         21 $#{$self->{_types}} = -1;
  12         42  
34 12         31 $#{$self->{_scalar_types}} = -1;
  12         31  
35             }
36             sub _init {
37 12     12   112 my $self = shift;
38 12         40 $self->_reset();
39 12         53 $self->SUPER::_init(@_);
40 12         130 push @{$self->{_scalar_types}}, 'string-UTF8'
  12         33  
41             }
42             sub only_unicode {
43 14     14 1 3586 my $self = $_[0];
44 14         32 my $m = $_[1];
45 14 100       79 return $self->{'_only-unicode'} unless defined $m;
46 7         29 $self->_init();
47 7         31 $self->{'_only-unicode'} = $m;
48 7 100       34 if( $m == 1 ){
    100          
49             # delete just the 'string' type
50             # we will get various types but the strings will
51             # be exclusively unicode
52 5         13 my @idx = grep { $self->{'_scalar_types'}->[$_] eq 'string' }
53 1         3 reverse 0 .. $#{$self->{_scalar_types}}
  1         4  
54             ;
55 1         12 splice(@{$self->{_scalar_types}}, $_, 1) for @idx;
  1         5  
56             } elsif( $m > 1 ){
57             # delete ALL the _scalar_types and leave just our unicode string
58             # we will get only unicode strings no other scalar type
59 2         4 $#{$self->{_scalar_types}} = -1;
  2         7  
60 2         4 push @{$self->{_scalar_types}}, 'string-UTF8'
  2         4  
61             }
62 7         16 return $m
63             }
64             sub random_char_UTF8 {
65             # the crucial part borrowed from The Effective Perler:
66             # https://www.effectiveperlprogramming.com/2018/08/find-the-new-emojis-in-perls-unicode-support/
67             # my $achar;
68             # for(my $trials=100;$trials-->0;){
69             # $achar = chr(int(rand(0x10FFF+1)));
70             # return $achar if $achar =~ /\p{Present_In: 8.0}/;
71             # }
72              
73             # just greek and coptic no holes
74 17822     17822 1 50354 return chr(0x03B0+int(rand(0x03F0-0x03B0)));
75              
76 0         0 my $arand = rand();
77 0 0       0 if( $arand < 0.2 ){
    0          
    0          
    0          
78 0         0 return chr(0x03B0+int(rand(0x03F0-0x03B0)))
79             } elsif( $arand < 0.4 ){
80 0         0 return chr(0x0400+int(rand(0x040F-0x0400)))
81             } elsif( $arand < 0.6 ){
82 0         0 return chr(0x13A0+int(rand(0x13EF-0x13A0)))
83             } elsif( $arand < 0.8 ){
84 0         0 return chr(0x1200+int(rand(0x137F-0x1200)))
85             }
86 0         0 return chr(0xA980+int(rand(0xA9DF-0xA980)))
87             }
88             sub random_chars_UTF8 {
89 8911     8911 1 22519 my %options = @_;
90 8911 50       19554 my $minl = defined($options{'min'}) ? $options{'min'} : 6;
91 8911 50       16525 my $maxl = defined($options{'max'}) ? $options{'max'} : 32;
92 8911         14077 my $ret = "";
93 8911         27067 for(1..($minl+int(rand($maxl-$minl)))){
94 17822         33027 $ret .= random_char_UTF8()
95             }
96 8911         36190 return $ret;
97             }
98             # override's parent's.
99             # first call parent's namesake and if it fails because it
100             # is decided to generate UTF8 something, it will default to
101             # this method which must deal with all the extenstions we introduced
102             # in our own _init()
103             # CAVEAT: it relies on parent croaking the message
104             # "I don't know how to generate $type\n"
105             # if that chanegs (in parent) then we will no longer be able to deduce
106             # $type and have to change this program.
107             # if that happens please file a bug.
108             # unfortunately our parent class does not allow for input params...
109             sub generate_scalar {
110 19431     19431 1 98157 my $self = shift;
111 19431         30802 my $rc = eval { $self->SUPER::generate_scalar(@_) };
  19431         41814  
112 19431 100 66     1839856 if( $@ || ! defined($rc) ){
113 8911 50       54310 if( $@ !~ /how to generate (.+?)\R/ ){
114 0         0 warn "something changed in parent class and can not parse this message any more, please file a bug: '$@'";
115 0         0 return scalar(random_chars_UTF8(min=>2,max=>2));
116             }
117 8911         20691 my $type = $1;
118 8911 50       19050 if( $type eq 'string-UTF8' ){
119 8911         19704 return scalar(random_chars_UTF8(min=>2,max=>2));
120             } else {
121 0         0 warn "child: I don't know how to generate $type, this is a bug, please file a bug and mention this: $@\n";
122             # but don't die
123 0         0 return scalar(random_chars_UTF8(min=>2,max=>2));
124             }
125             }
126 10520         32949 return $rc
127             }
128             sub check_content_recursively {
129 1958     1958 1 21561 my $looking_for = $_[1]; # a hashref of types to look-for, required
130 1958         2902 my $bitparams = 0;
131 1958 100 100     7053 $bitparams |= 1 if exists($looking_for->{'numbers'}) && ($looking_for->{'numbers'}==1);
132 1958 100 100     5196 $bitparams |= 2 if exists($looking_for->{'strings-unicode'}) && ($looking_for->{'strings-unicode'}==1);
133 1958 100 100     4896 $bitparams |= 4 if exists($looking_for->{'strings-plain'}) && ($looking_for->{'strings-plain'}==1);
134 1958 100 100     4592 $bitparams |= (2+4) if exists($looking_for->{'strings'}) && ($looking_for->{'strings'}==1);
135 1958         3657 return _check_content_recursively($_[0], $bitparams);
136             }
137             # returns 1 if we are looking for it and it was found
138             # returns 0 if what we were looking for was not found.
139             # 'looking_for' can be more than one things.
140             # it is a bit string, 1st bit if set looks for numbers,
141             # 2nd bit, if set, looks for unicode strings,
142             # 3rd bit, if set, looks for non-unicode strings (plain)
143             # if you set 'numbers'=>0, it simply means "do not check for numbers"
144             # and so it will not check if it has any numbers
145             # by giving nothing to check, it return 0, nothing was found
146             sub _check_content_recursively {
147 22565     22565   32824 my $inp = $_[0];
148             # NUMBER,UNICODE_STRING,NON_UNICODE_STRING
149 22565         29841 my $looking_for = $_[1];
150 22565         32712 my $aref = ref($inp);
151 22565         31924 my ($r, $v);
152 22565 100 66     50944 if( ($aref eq '') || ($aref eq 'SCALAR') ){
    100          
    50          
153 20099 50       34748 if( $aref eq 'SCALAR' ){ $inp = $$inp }
  0         0  
154 20099 100       42952 if( looks_like_number($inp) ){
155 5381 100       9761 return 1 if $looking_for & 1; # a number
156 4979         8481 return 0;
157             }
158 14718 100       22733 if( _has_utf8($inp) ){
159 13939 100       26440 return 1 if $looking_for & 2; # unicode string
160 13337         25295 return 0;
161             }
162 779 100       1989 return 1 if $looking_for & 4; # plain string
163 491         985 return 0;
164             } elsif( $aref eq 'ARRAY' ){
165 1243         2322 for my $v (@$inp){
166 7915         12574 $r = _check_content_recursively($v, $looking_for);
167 7915 100       16493 return 1 if $r;
168             }
169             } elsif( $aref eq 'HASH' ){
170 1223         6795 for my $k (sort keys %$inp){
171 6535         10936 $r = _check_content_recursively($k, $looking_for);
172 6535 100       12365 return 1 if $r;
173 6157         10477 $r = _check_content_recursively($inp->{$k}, $looking_for);
174 6157 100       13884 return 1 if $r;
175             }
176 0         0 } else { die "don't know how to deal with this ref '$aref'" }
177             }
178 14718     14718   43065 sub _has_utf8 { return $_[0] =~ /[^\x00-\x7f]/ }
179             # this does not work for unicode strings
180             # from https://www.perlmonks.org/?node_id=958679
181             # and https://www.perlmonks.org/?node_id=791677
182             #sub isnum ($) {
183             # return 0 if $_[0] eq '';
184             # $_[0] & ~$_[0] ? 0 : 1
185             #}
186             1;
187              
188             =pod
189              
190             =encoding utf8
191              
192             =head1 NAME
193              
194             Data::Random::Structure::UTF8 - Produce nested data structures with unicode keys, values, elements.
195              
196             =head1 VERSION
197              
198             Version 0.06
199              
200             =head1 SYNOPSIS
201              
202             This module produces random, arbitrarily deep and long,
203             nested Perl data structures with unicode content for the
204             keys, values and/or array elements. Content can be forced
205             to be exclusively strings and exclusively unicode. Or
206             the strings can be unicode. Or anything goes, mixed
207             unicode and non-unicode strings as well as integers, floats, etc.
208              
209             This is an object-oriented module
210             which inherits from
211             L and extends its functionality by
212             providing for unicode keys and values for hashtables and
213             unicode content for array elements or scalars, randomly mixed with the
214             usual repertoire of L, which is
215             non-unicode strings,
216             numerical, boolean values and the assorted entourage to the court
217             of Emperor Computer, post-Turing.
218              
219             For example, it produces these:
220              
221             =over 4
222              
223             =item * unicode scalars: e.g. C<"αβγ">,
224              
225             =item * mixed arrays: e.g. C<["αβγ", "123", "xyz"]>
226              
227             =item * hashtables with some/all keys and/or values as unicode: e.g.
228             C<{"αβγ" => "123", "xyz" => "αβγ"}>
229              
230             =item * exclusive unicode arrays or hashtables: e.g. C<["αβγ", "χψζ"]>
231              
232             =back
233              
234             This is accomplised by adding an extra
235             type C (invisible to the user) and the
236             respective generator method. All these are invisible to the user
237             which will get the old functionality plus some (or maybe none
238             because this is a random process which does not eliminate non-unicode
239             strings, at the moment) unicode strings.
240              
241             use Data::Random::Structure::UTF8;
242              
243             my $randomiser = Data::Random::Structure::UTF8->new(
244             'max_depth' => 5,
245             'max_elements' => 20,
246             # all the strings produced (keys, values, elements)
247             # will be unicode strings
248             'only-unicode' => 1,
249             # all the strings produced (keys, values, elements)
250             # will be a mixture of unicode and non-unicode
251             # this is the default behaviour
252             #'only-unicode' => 0,
253             # only unicode strings will be produced for (keys, values, elements),
254             # there will be no numbers, no bool, only unicode strings
255             #'only-unicode' => 2,
256             );
257             my $perl_var = $randomiser->generate() or die;
258             print pp($perl_var);
259              
260             # which prints the usual escape mess of Dump and Dumper
261             [
262             "\x{7D5A}\x{4EC1}",
263             "\x{E6E2}\x{75A4}",
264             329076,
265             0.255759160148987,
266             [
267             "TEb97qJt",
268             1,
269             "_ow|J\@~=6%*N;52?W3Y\$S1",
270             {
271             "x{75A4}x{75A4}" => 123,
272             "123" => "\x{7D5A}\x{4EC1}",
273             "xyz" => [1, 2, "\x{7D5A}\x{4EC1}"],
274             },
275             ],
276              
277             # can control the scalar type (for keys, values, items) on the fly
278             # this produces unicode strings in addition to
279             # Data::Random::Structure's usual repertoire:
280             # non-unicode-string, numbers, bool, integer, float, etc.
281             # (see there for the list)
282             $randomiser->only_unicode(0); # the default: anything plus unicode strings
283             print $randomiser->only_unicode();
284              
285             # this produces unicode strings in addition to
286             # Data::Random::Structure's usual repertoire:
287             # numbers, bool, integer, float, etc.
288             # (see there for the list)
289             # EXCEPT non-unicode-strings, (all strings will be unicode)
290             $randomiser->only_unicode(1);
291             print $randomiser->only_unicode();
292              
293             # this produces unicode strings ONLY
294             # Data::Random::Structure's usual repertoire does not apply
295             # there will be no numbers, no bool, no integer, no float, no nothing
296             $randomiser->only_unicode(2);
297             print $randomiser->only_unicode();
298              
299             =head1 METHODS
300              
301             This is an object oriented module which has exactly the same API as
302             L.
303              
304             =head2 C
305              
306             Constructor. In addition to L C<>
307             API, it takes parameter C<< 'only-unicode' >> with
308             a valid value of 0, 1 or 2. Default is 0.
309              
310             =over 4
311              
312             =item * 0 : keys, values, elements of the produced data structure will be
313             a mixture of unicode strings, plus L's full
314             repertoire which includes non-unicode strings, integers, floats etc.
315              
316             =item * 1 : keys, values, elements of the produced data structure will be
317             a mixture of unicode strings, plus L's full
318             repertoire except non-unicode strings. That is, all strings will be
319             unicode. But there will possibly be integers etc.
320              
321             =item * 2 : keys, values, elements of the produced data structure will be
322             only unicode strings. Nothing of L's
323             repertoire applies. Only unicode strings, no integers, no nothing.
324              
325             =back
326              
327             Controlling the scalar data types can also be done on the fly, after
328             the object has been created using
329             L C<>
330             method.
331              
332             Additionally, L C<>'s API reports that
333             the constructor takes 2 optional arguments, C and C.
334             See L C<> for up-to-date, official information.
335              
336             =head2 C
337              
338             Controls what scalar types to be included in the nested
339             data structures generated. With no parameters it returns back
340             the current setting. Otherwise, valid input parameters and their
341             meanings are listed in L C<>
342              
343             =head2 C
344              
345             Generate a nested data structure according to the specification
346             set in the constructor. See L C<> for
347             all options. This method is not overriden by this module.
348              
349             It returns the Perl data structure as a reference.
350              
351             =head2 C
352              
353             Generate a scalar which may contain unicode content.
354             See L for
355             all options. This method is overriden by this module but
356             calls the parent's too.
357              
358             It returns a Perl string.
359              
360             =head2 C
361              
362             Generate an array with random, possibly unicode, content.
363             See L for
364             all options. This method is not overriden by this module.
365              
366             It returns the Perl array as a reference.
367              
368             =head2 C
369              
370             Generate an array with random, possibly unicode, content.
371             See L for
372             all options. This method is not overriden by this module.
373              
374             It returns the Perl array as a reference.
375              
376             =head2 C
377              
378             Return a random unicode character, guaranteed to be valid.
379             This is a very simple method which selects characters
380             from some pre-set code pages (Greek, Cyrillic, Cherokee,
381             Ethiopic, Javanese) with equal probability.
382             These pages and ranges were selected so that there are
383             no "holes" between them which would produce an invalid
384             character. Therefore, not all characters from the
385             particular code page will be produced.
386              
387             Returns a random unicode character guaranteed to be valid.
388              
389             =head2 C
390              
391             my $ret = random_chars_UTF8($optional_paramshash)
392              
393             Arguments:
394              
395             =over 4
396              
397             =item * C<$optional_paramshash> : can contain
398              
399             =over 4
400              
401             =item * C<'min'> sets the minimum length of the random sequence to be returned, default is 6
402              
403             =item * C<'max'> sets the maximum length of the random sequence to be returned, default is 32
404              
405             =back
406              
407             =back
408              
409             Return a random unicode-only string optionally specifying
410             minimum and maximum length. See
411             L C<>
412             for the range of characters it returns. The returned string
413             is unicode and is guaranteed all its characters are valid.
414              
415             =head1 SUBROUTINES
416              
417             =head2 C
418              
419             my $ret = check_content_recursively($perl_var, $paramshashref)
420              
421             Arguments:
422              
423             =over 4
424              
425             =item * C<$perl_var> : a Perl variable containing an arbitrarily nested data structure
426              
427             =item * C<$paramshashref> : can contain one or more of the following keys:
428              
429             =over 4
430              
431             =item * C<'numbers'> set it to 1 to look for numbers (possibly among other things).
432             If set to 1 and a number C<123> or C<"123"> is found, this sub returns 1.
433             Set it to 0 to not look for numbers at all (and not report if
434             there are no numbers) - I, that's what
435             setting this to zero means.
436              
437             =item * C<'strings-unicode'> set it to 1 to look for unicode strings (possibly among other things).
438             The definition of "unicode string" is that at least one its characters is unicode.
439             If set to 1 and a "unicode string" is found, this sub returns 1.
440              
441             =item * C<'strings-plain'> set it to 1 to look for plain strings (possibly among other things).
442             The definition of "plain string" is that none of its characters is unicode.
443             If set to 1 and a "plain string" is found, this sub returns 1.
444              
445             =item * C<'strings'> set it to 1 to look for plain or unicode strings (possibly among other things).
446             If set to 1 and a "plain string" or "unicode string" is found, this sub returns 1. Basically,
447             it returns 1 when a string is found (as opposed to a "number").
448              
449             =back
450              
451             =back
452              
453             In general, by setting C<<'strings-unicode'=>1>> you are checking whether
454             the input Perl variable contains a unicode string in a key, a value,
455             an array element, or a scalar reference.
456              
457             But, setting C<<'strings-unicode'=>0>>, it simply means do not look for
458             this. It does not mean I.
459              
460             Return value: 1 or 0 depending whether what
461             was looking for, was found.
462              
463             This is not an object-oriented method. It is called thously:
464              
465             # check if ANY scalar (hash key, value, array element or scalar ref)
466             # contains ONLY single number (integer, float)
467             # the decicion is made by Scalar::Util:looks_like_number()
468             if( Data::Random::Structure::UTF8::check_content_recursively(
469             {'abc'=>123, 'xyz'=>[1,2,3]},
470             {
471             # look for numbers, are there any?
472             'numbers' => 1,
473             }
474             ) ){ print "data structure contains numbers\n" }
475              
476             # check if it contains no numbers but it does unicode strings
477             if( Data::Random::Structure::UTF8::check_content_recursively(
478             {'abc'=>123, 'xyz'=>[1,2,3]},
479             {
480             # don't look for numbers
481             'numbers' => 0,
482             # look for unicode strings, are there any?
483             'strings-unicode' => 1,
484             }
485             ) ){ print "data structure contains numbers\n" }
486              
487             CAVEAT: as its name suggests, this is a recursive function. Beware
488             of extremely deep data structures. Deep, not long. If you do get
489             C<<"Deep recursion..." warnings>>, and you do insist to go ahead,
490             this will remove the warnings (but are you sure?):
491              
492             {
493             no warnings 'recursion';
494             if( Data::Random::Structure::UTF8::check_content_recursively(
495             {'abc'=>123, 'xyz'=>[1,2,3]},
496             {
497             'numbers' => 1,
498             }
499             ) ){ print "data structure contains numbers\n" }
500             }
501              
502             =head1 SEE ALSO
503              
504             =over 4
505              
506             =item * The parent class L.
507              
508             =item * L for stringifying possibly-unicode Perl data structures.
509              
510             =back
511              
512             =head1 AUTHOR
513              
514             Andreas Hadjiprocopis, C<< >>
515              
516             =head1 BUGS
517              
518             Please report any bugs or feature requests to C, or through
519             the web interface at L. I will be notified, and then you'll
520             automatically be notified of progress on your bug as I make changes.
521              
522             =head1 CAVEATS
523              
524             There are two issues users should know about.
525              
526             The first issue is that the unicode produced can make
527             L to complain with
528              
529             Operation "lc" returns its argument for UTF-16 surrogate U+DA4B at /usr/local/share/perl5/Data/Dump.pm line 302.
530              
531             This, I have found, can be fixed with the following workaround (from L, thank you):
532              
533             # Suppress `Operation "lc" returns its argument for UTF-16 surrogate 0xNNNN` warning
534             # for the `lc()` call below; use 'utf8' instead of a more appropriate 'surrogate' pragma
535             # since the latter is not available in until Perl 5.14
536             no warnings 'utf8';
537              
538             The second issue is that this class inherits from L
539             and relies on it complaining about not being able to handle certain types
540             which are our own extensions (the C extension). We have
541             no way to know that except from catching its C'ing and parsing it
542             with the following code
543              
544             my $rc = eval { $self->SUPER::generate_scalar(@_) };
545             if( $@ || ! defined($rc) ){
546             # parent doesn't know what to do, can we handle this?
547             if( $@ !~ /how to generate (.+?)\R/ ){ ... ... }
548             else { print "type is $1" }
549             ...
550              
551             in order to extract the C which can not be handled
552             and handle it ourselves. So whenever the parent class (L)
553             changes its C song, we will have to adopt this code
554             accordingly (in L C<>).
555             For the moment, I have placed a catch-all, fall-back condition
556             to handle this but it will be called for all kind of types
557             and not only the types we have added.
558              
559             So, this issue is not going to make the module die but may make it
560             to skew the random results in favour of unicode strings (which
561             is the fallback, default action when can't parse the type).
562              
563             =head1 SUPPORT
564              
565             You can find documentation for this module with the perldoc command.
566              
567             perldoc Data::Random::Structure::UTF8
568              
569              
570             You can also look for information at:
571              
572             =over 4
573              
574             =item * RT: CPAN's request tracker (report bugs here)
575              
576             L
577              
578             =item * AnnoCPAN: Annotated CPAN documentation
579              
580             L
581              
582             =item * CPAN Ratings
583              
584             L
585              
586             =item * Search CPAN
587              
588             L
589              
590             =back
591              
592             =head1 SEE ALSO
593              
594             =over 4
595              
596             =item * L
597              
598             =back
599              
600             =head1 ACKNOWLEDGEMENTS
601              
602             Mark Allen who created L which is our parent class.
603              
604             =head1 DEDICATIONS AND HUGS
605              
606             !Almaz!
607              
608             =head1 LICENSE AND COPYRIGHT
609              
610             This software is Copyright (c) 2020 by Andreas Hadjiprocopis.
611              
612             This is free software, licensed under:
613              
614             The Artistic License 2.0 (GPL Compatible)
615              
616             =cut