File Coverage

blib/lib/String/Copyright.pm
Criterion Covered Total %
statement 78 88 88.6
branch 26 38 68.4
condition 25 40 62.5
subroutine 17 19 89.4
pod 0 2 0.0
total 146 187 78.0


line stmt bran cond sub pod time code
1 7     7   1634109 use 5.008001;
  7         60  
2 7     7   40 use strict;
  7         13  
  7         144  
3 7     7   34 use warnings;
  7         13  
  7         190  
4 7     7   39 use utf8;
  7         15  
  7         47  
5 7     7   255 use re (qw/eval/);
  7         14  
  7         655  
6              
7             my $CAN_RE2;
8              
9             BEGIN {
10 7     7   25 eval { require re::engine::RE2 };
  7         937  
11 7 50       737 $CAN_RE2 = $@ ? '' : 1;
12             }
13              
14             package String::Copyright;
15              
16             =encoding UTF-8
17              
18             =head1 NAME
19              
20             String::Copyright - Representation of text-based copyright statements
21              
22             =head1 VERSION
23              
24             Version 0.003012
25              
26             =cut
27              
28             our $VERSION = '0.003012';
29              
30             # Dependencies
31 7     7   2759 use parent 'Exporter::Tiny';
  7         2661  
  7         43  
32 7     7   27724 use Carp ();
  7         20  
  7         148  
33 7     7   4288 use Set::IntSpan;
  7         71630  
  7         624  
34              
35             our @EXPORT = qw/copyright/;
36              
37             use constant {
38 7         659 PLAINTEXT => 0,
39             BLOCKS => 1,
40             FORMAT => 2,
41 7     7   74 };
  7         17  
42              
43             use overload (
44 7         48 q{""} => '_compose',
45             fallback => 1,
46 7     7   47 );
  7         15  
47              
48             =head1 SYNOPSIS
49              
50             use String::Copyright;
51              
52             my $copyright = copyright(<<'END');
53             copr. © 1999,2000 Foo Barbaz and Acme Corp.
54             Copyright (c) 2001,2004 Foo (work address)
55             Copyright 2003, Foo B. and friends
56             © 2000, 2002 Foo Barbaz
57             END
58              
59             print $copyright;
60              
61             # Copyright 1999-2000 Foo Barbaz and Acme Corp.
62             # Copyright 2000, 2002 Foo Barbaz and Acme Corp.
63             # Copyright 2001, 2004 Foo (work address)
64             # Copyright 2003 Foo B. and friends
65              
66             =head1 DESCRIPTION
67              
68             L Parses common styles of copyright statements
69             and serializes in normalized format.
70              
71             =head1 OPTIONS
72              
73             Options can be set as an argument to the 'use' statement.
74              
75             =head2 threshold, threshold_before, threshold_after
76              
77             use String::Copyright { threshold_after => 5 };
78              
79             Stop parsing after this many lines whithout copyright information,
80             before or after having found any copyright information at all.
81             C sets both C and C.
82              
83             By default unset: All lines are parsed.
84              
85             =head2 format( \&sub )
86              
87             use String::Copyright { format => \&GNU_style } };
88              
89             sub GNU_style {
90             my ( $years, $owners ) = @_;
91              
92             return 'Copyright (C) ' . join ' ', $years || '', $owners || '';
93             }
94              
95             =head1 FUNCTIONS
96              
97             Exports one function: C.
98             This module uses L to export functions,
99             which allows for flexible import options;
100             see the L documentation for details.
101              
102             =cut
103              
104             # OR'ed strings have regular variable name and are already grouped
105             # AND'ed strings have name ending in underscore: must be grouped if repeated
106             my $blank = '[ \t]';
107             my $blank_or_break_ = "$blank*\\n?$blank*";
108             my $dash = '[-˗‐‑‒–—―⁃−﹣-]';
109             my $colons_ = "$blank?:{1,2}";
110             my $label = '(?i:copyright(?:-holders?)?\b|copr\.)';
111             my $sign = '[©⒞Ⓒⓒ🄒🄫🅒]';
112             my $nroff_sign_ = '\\\\[(]co';
113             my $pseudo_sign_ = '[({][Cc][})]';
114             my $vague_sign_ = '-[Cc]-';
115             my $broken_sign_ = "\\?$blank*";
116              
117             # high-bit © noise, caused by misparsing UTF-8 as latin1
118             # except \xAE (latin1 ©), \xAE (MacRoman ©), \xE2 (latin1 © lowercased after misparse)
119             my $nonsign_ = '[\x80-\xAB\xAD-\xC1\xC3-\xE1\xE3-\xFF]\xA9';
120             my $nonidentifier_
121             = "(?:no |_|$dash)copyright|copyright-[^h]|(?:Digital Millennium|U.S.|US|United States) Copyright Act|\\b(?:for|we) copyright\\b";
122              
123             # this should cause *no* false positives, and stop-chars therefore
124             # exclude e.g. email address building blocks; tested against the code
125             # corpus at https://codesearch.debian.net/ (tricky: its RE2 engine lacks
126             # support for negative groups) using searches like these:
127             # (?i)copyright (?:(?:claim|holder|info|information|notice|owner|ownership|statement|string)s?|in|is|to)@\w
128             # (?i)copyright (?:(?:claim|holder|info|information|notice|owner|ownership|statement|string)s?|in|is|to)@\b[-_@]
129             # (?im)copyright (?:(?:claim|holder|info|information|notice|owner|ownership|statement|string)s?|in|is|to)[^ $]
130             my $identifier_action
131             = '(?i:apply|applied|applies|assigned|generated|transfer|transferred)';
132             my $identifier_thing_
133             = '(?i:block|claim|date|disclaimer|holder|info|information|interest|law|license|notice|owner|ownership|permission|sign|statement|string|symbol|tag|text)s?';
134             my $identifier_misc
135             = "(?i:and|are|at|eq|for|if|in|is|of|on|or|,${blank}patent|this|to|the (?:library|software),|treaty)";
136             my $identifier_chatter
137             = "(?:$identifier_action|$identifier_thing_|$identifier_misc)";
138             my $the_notname
139             = '(?i:concrete|fault|first|immediately|least|min\/max|one|outer|previous|ratio|sum|user)';
140             my $the_sentence_
141             = "(?:\\w+$blank+){1,10}(?i:are|can(?:not)?|in|is|must|was)";
142             my $pseudosign_chatter_
143             = "(?:(?:the$blank+(?:$the_notname|$the_sentence_)|all begin|there|you must)\\b|,? \\(?\\w\\))";
144             my $chatter
145             = "(?im:$nonsign_|$nonidentifier_|copyright$blank_or_break_$identifier_chatter(?:\\z|@\\W|[^a-zA-Z0-9@_-])|$blank*$pseudo_sign_(?:$blank_or_break_)+$pseudosign_chatter_)";
146             my $nonyears = '(?:?|19xx|19yy|yyyy|YEAR)';
147              
148             my $year_ = '\b[0-9]{4}\b';
149             my $comma_spacy = "(?:$blank*,$blank_or_break_|$blank_or_break_,?$blank*)";
150             my $dash_spacy_ = "$blank*$dash(?:$blank_or_break_)*";
151              
152             my $colon_or_dash = "(?:$colons_$blank_or_break_|$blank?$dash\{1,2}$blank)";
153             my $delimiter = "(?:$colon_or_dash|$comma_spacy)";
154              
155             my $vague_year_ = "(?:$dash$blank?)?[0-9]{1,5}";
156             my $owner_intro_
157             = "(?:$colon_or_dash|$pseudo_sign_$blank?|\\bby$blank_or_break_)";
158             my $owner_prefix = '[(*<@\[{]';
159             my $owner_initial = '[^\s!"#$%&\'()*+,./:;<=>?@[\\\\\]^_`{|}~-]';
160              
161             my $signs
162             = "(?m:(?:$label|$sign|$nroff_sign_|(?:^|$blank)$pseudo_sign_)(?:$colon_or_dash?$blank*(?:$label|$sign|$pseudo_sign_))*)";
163             my $yearspan_ = "$year_(?:$dash_spacy_$year_)?";
164             my $years_ = "$yearspan_(?:$comma_spacy$yearspan_)*";
165             my $owners_
166             = "(?:$vague_year_|$owner_prefix*$owner_initial\\S*)(?:$blank*\\S+)*";
167              
168             # compile regexps in isolation to limit use of RE2 engine
169             my ($dash_spacy_re, $owner_intro_A_re, $boilerplate_X_re,
170             $signs_and_more_re
171             );
172             {
173 7 50   7   11483 BEGIN { re::engine::RE2->import( -strict => 1 ) if ($CAN_RE2) }
174             $dash_spacy_re = qr/$dash_spacy_/;
175             $owner_intro_A_re = qr/^$owner_intro_/;
176             $boilerplate_X_re
177             = qr/(?i)${comma_spacy}All$blank+Rights$blank+Reserved[.!]?.*/;
178             $signs_and_more_re
179             = qr/$chatter|$signs(?:$blank$vague_sign_)?$delimiter(?:$broken_sign_)?(?:$nonyears|((?:$years_$delimiter)?(?:(?:$owner_intro_)?$owners_)?))|\n/;
180             }
181              
182             sub _generate_copyright
183             {
184 13     13   2329 my ( $class, $name, $args, $globals ) = @_;
185              
186             return sub {
187 183     183   47379 my $copyright = shift;
188              
189             Carp::croak("String::Copyright strings require defined parts")
190 183 50       461 unless 1 + @_ == grep {defined} $copyright, @_;
  183         720  
191              
192             # String::Copyright objects are effectively immutable and can be reused
193 183 50 33     897 if ( !@_ && ref($copyright) eq __PACKAGE__ ) {
194 0         0 return $copyright;
195             }
196              
197             # stringify objects
198 183         351 $copyright = "$copyright";
199              
200             # TODO: also parse @_ - but each separately!
201 183         296 my @block;
202 183         294 my $skipped = 0;
203 183         4476 while ( $copyright =~ /$signs_and_more_re/g ) {
204 266         747 my $owners = $1;
205 266 100 66     1036 if ( $globals->{threshold_before} || $globals->{threshold} ) {
206             last
207             if (!@block
208             and !length $owners
209             and ++$skipped >= ( $globals->{threshold_before}
210 24 100 100     80 || $globals->{threshold} ) );
      33        
      100        
211             }
212 264 100 66     853 if ( $globals->{threshold_after} || $globals->{threshold} ) {
213              
214             # "after" detects end of _current_ line so is skewed by one
215             last
216             if (@block
217             and !length $owners
218             and ++$skipped >= 1
219             + ( $globals->{threshold_after} || $globals->{threshold} )
220 14 100 100     63 );
      33        
      100        
221             }
222 262 100       2063 next if ( !length $owners );
223 120         191 $skipped = 0;
224              
225 120         163 my $years;
226 120         1211 my @span = $owners =~ /\G($yearspan_)(?:$comma_spacy|\Z)/gm;
227 120 100       309 if (@span) {
228 47         125 $owners = $';
229              
230             # deduplicate
231 47         78 my @ranges;
232 47         97 for (@span) {
233 74         364 my ( $y1, $y2 ) = split /$dash_spacy_re/;
234 74 100       221 if ( !$y2 ) {
    100          
235 59         157 push @ranges, $y1;
236             }
237             elsif ( $y1 > $y2 ) {
238 1         5 push @ranges, [ $y2, $y1 ];
239             }
240             else {
241 14         54 push @ranges, [ $y1, $y2 ];
242             }
243             }
244              
245             # normalize
246             $years = join ', ',
247 47 100       281 map { $_->[0] == $_->[1] ? $_->[0] : "$_->[0]-$_->[1]" }
  51         5511  
248             Set::IntSpan->new( \@ranges )->spans;
249             }
250 120 100       392 if ($owners) {
251 114         606 $owners =~ s/$owner_intro_A_re//;
252 114         342 $owners =~ s/\s{2,}/ /g;
253 114         370 $owners =~ s/$owner_intro_A_re//;
254 114         369 $owners =~ s/$boilerplate_X_re//g;
255             }
256              
257             # split owner into owner_id and owner
258              
259 120   100     979 push @block, [ $years || undef, $owners || undef ];
      100        
260             }
261              
262             # TODO: save $skipped_lines to indicate how dirty parsing was
263              
264 183         398 my $ext_format = $globals->{format};
265             my $format
266             = $globals->{format}
267 109     109   290 ? sub { $ext_format->( $_->[0], $_->[1] ) }
268 183 100 33 25   922 : sub { join ' ', '©', $_->[0] || (), $_->[1] || () };
  25   33     158  
269              
270 183         1442 bless [ $copyright, \@block, $format ], __PACKAGE__;
271             }
272 13         201 }
273              
274             sub new
275             {
276 0     0 0 0 my ( $self, @data ) = @_;
277             Carp::croak("String::Copyright require defined, positive-length parts")
278 0 0       0 unless 1 + @_ == grep { defined && length } @data;
  0 0       0  
279              
280             # String::Copyright objects are simply stripped of their string part
281 0 0 0     0 if ( !@_ && ref($self) eq __PACKAGE__ ) {
282 0         0 return bless [ undef, $data[1] ], __PACKAGE__;
283             }
284              
285             # FIXME: properly validate data
286             Carp::croak("String::Copyright blocks must be an array of strings")
287 0 0       0 unless @_ == grep { ref eq 'ARRAY' } @data;
  0         0  
288              
289 0         0 bless [ undef, \@data ], __PACKAGE__;
290             }
291              
292             sub _compose
293             {
294 197     197   47042 my $format = $_[0]->[FORMAT];
295 197         356 join "\n", map {&$format} @{ $_[0]->[BLOCKS] };
  134         268  
  197         691  
296             }
297              
298 0     0 0   sub is_normalized { !defined $_[0]->[PLAINTEXT] }
299              
300             =head1 SEE ALSO
301              
302             =over 4
303              
304             =item *
305              
306             L
307              
308             =item *
309              
310             L
311              
312             =back
313              
314             =head1 BUGS/CAVEATS/etc
315              
316             L operates on strings, not bytes.
317             Data encoded as UTF-8, Latin1 or other formats
318             need to be decoded to strings before use.
319              
320             Only ASCII characters and B<©> (copyright sign) are directly processed.
321              
322             If copyright sign is mis-detected
323             or accents or multi-byte characters display wrong,
324             then most likely the data was not decoded into a string.
325              
326             If ranges or lists of years are not tidied,
327             then maybe it contained non-ASCII whitespace or digits.
328              
329             =head1 AUTHOR
330              
331             Jonas Smedegaard C<< >>
332              
333             =head1 COPYRIGHT AND LICENSE
334              
335             This program is based on the script "licensecheck" from the KDE SDK,
336             originally introduced by Stefan Westerfeld C<< >>.
337              
338             Copyright © 2007, 2008 Adam D. Barratt
339              
340             Copyright © 2005-2012, 2016, 2018, 2020-2021 Jonas Smedegaard
341              
342             Copyright © 2018, 2020-2021 Purism SPC
343              
344             This program is free software:
345             you can redistribute it and/or modify it
346             under the terms of the GNU Affero General Public License
347             as published by the Free Software Foundation,
348             either version 3, or (at your option) any later version.
349              
350             This program is distributed in the hope that it will be useful,
351             but WITHOUT ANY WARRANTY;
352             without even the implied warranty
353             of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
354             See the GNU Affero General Public License for more details.
355              
356             You should have received a copy
357             of the GNU Affero General Public License along with this program.
358             If not, see .
359              
360             =cut
361              
362             1;