File Coverage

blib/lib/String/Copyright.pm
Criterion Covered Total %
statement 78 88 88.6
branch 26 38 68.4
condition 25 40 62.5
subroutine 17 19 89.4
pod 0 2 0.0
total 146 187 78.0


line stmt bran cond sub pod time code
1 7     7   1640811 use 5.008001;
  7         62  
2 7     7   41 use strict;
  7         14  
  7         177  
3 7     7   38 use warnings;
  7         16  
  7         191  
4 7     7   37 use utf8;
  7         12  
  7         37  
5 7     7   246 use re (qw/eval/);
  7         15  
  7         629  
6              
7             my $CAN_RE2;
8              
9             BEGIN {
10 7     7   27 eval { require re::engine::RE2 };
  7         890  
11 7 50       772 $CAN_RE2 = $@ ? '' : 1;
12             }
13              
14             package String::Copyright;
15              
16             =encoding UTF-8
17              
18             =head1 NAME
19              
20             String::Copyright - Representation of text-based copyright statements
21              
22             =head1 VERSION
23              
24             Version 0.003013
25              
26             =cut
27              
28             our $VERSION = '0.003013';
29              
30             # Dependencies
31 7     7   2638 use parent 'Exporter::Tiny';
  7         2572  
  7         38  
32 7     7   27452 use Carp ();
  7         30  
  7         157  
33 7     7   3746 use Set::IntSpan;
  7         70709  
  7         572  
34              
35             our @EXPORT = qw/copyright/;
36              
37             use constant {
38 7         603 PLAINTEXT => 0,
39             BLOCKS => 1,
40             FORMAT => 2,
41 7     7   64 };
  7         19  
42              
43             use overload (
44 7         40 q{""} => '_compose',
45             fallback => 1,
46 7     7   45 );
  7         14  
47              
48             =head1 SYNOPSIS
49              
50             use String::Copyright;
51              
52             my $copyright = copyright(<<'END');
53             copr. © 1999,2000 Foo Barbaz and Acme Corp.
54             Copyright (c) 2001,2004 Foo (work address)
55             Copyright 2003, Foo B. and friends
56             © 2000, 2002 Foo Barbaz
57             END
58              
59             print $copyright;
60              
61             # Copyright 1999-2000 Foo Barbaz and Acme Corp.
62             # Copyright 2000, 2002 Foo Barbaz and Acme Corp.
63             # Copyright 2001, 2004 Foo (work address)
64             # Copyright 2003 Foo B. and friends
65              
66             =head1 DESCRIPTION
67              
68             L identifies copyright statements in a string
69             and serializes them in a normalized format.
70              
71             =head1 OPTIONS
72              
73             Options can be set as an argument to the 'use' statement.
74              
75             =head2 threshold, threshold_before, threshold_after
76              
77             use String::Copyright { threshold_after => 5 };
78              
79             Stop parsing after this many lines whithout copyright information,
80             before or after having found any copyright information at all.
81             C sets both C and C.
82              
83             By default unset: All lines are parsed.
84              
85             =head2 format( \&sub )
86              
87             use String::Copyright { format => \&GNU_style } };
88              
89             sub GNU_style {
90             my ( $years, $owners ) = @_;
91              
92             return 'Copyright (C) ' . join ' ', $years || '', $owners || '';
93             }
94              
95             =head1 FUNCTIONS
96              
97             Exports one function: C.
98             This module uses L to export functions,
99             which allows for flexible import options;
100             see the L documentation for details.
101              
102             =cut
103              
104             # OR'ed strings have regular variable name and are already grouped
105             # AND'ed strings have name ending in underscore: must be grouped if repeated
106             my $blank = '[ \t]';
107             my $blank_or_break_ = "$blank*\\n?$blank*";
108             my $dash = '[-˗‐‑‒–—―⁃−﹣-]';
109             my $colons_ = "$blank?:{1,2}";
110             my $label = '(?i:copyright(?:-holders?)?\b|copr\.)';
111             my $sign = '[©⒞Ⓒⓒ🄒🄫🅒]';
112             my $nroff_sign_ = '\\\\[(]co';
113             my $pseudo_sign_ = '[({][Cc][})]';
114             my $vague_sign_ = '-[Cc]-';
115             my $broken_sign_ = "\\?$blank*";
116              
117             # high-bit © noise, caused by misparsing UTF-8 as latin1
118             # except \xAE (latin1 ©), \xAE (MacRoman ©), \xE2 (latin1 © lowercased after misparse)
119             my $nonsign_ = '[\x80-\xAB\xAD-\xC1\xC3-\xE1\xE3-\xFF]\xA9';
120             my $nonidentifier_
121             = "(?:no |_|$dash)copyright|copyright-[^h]|(?:Digital Millennium|U.S.|US|United States) Copyright Act|\\b(?:for|we) copyright\\b";
122              
123             # this should cause *no* false positives, and stop-chars therefore
124             # exclude e.g. email address building blocks; tested against the code
125             # corpus at https://codesearch.debian.net/ (tricky: its RE2 engine lacks
126             # support for negative groups) using searches like these:
127             # (?i)copyright (?:(?:claim|holder|info|information|notice|owner|ownership|statement|string)s?|in|is|to)@\w
128             # (?i)copyright (?:(?:claim|holder|info|information|notice|owner|ownership|statement|string)s?|in|is|to)@\b[-_@]
129             # (?im)copyright (?:(?:claim|holder|info|information|notice|owner|ownership|statement|string)s?|in|is|to)[^ $]
130             my $identifier_action
131             = '(?i:apply|applied|applies|assigned|generated|transfer|transferred)';
132             my $identifier_thing_
133             = '(?i:block|claim|date|disclaimer|holder|info|information|interest|law|license|notice|owner|ownership|permission|sign|statement|string|symbol|tag|text)s?';
134             my $identifier_misc
135             = "(?i:and|are|at|eq|for|if|in|is|of|on|or|,${blank}patent|this|to|the (?:library|software),|treaty)";
136             my $identifier_chatter
137             = "(?:$identifier_action|$identifier_thing_|$identifier_misc)";
138             my $the_notname
139             = '(?i:concrete|fault|first|immediately|least|min\/max|one|outer|previous|ratio|sum|user)';
140             my $the_sentence_
141             = "(?:\\w+$blank+){1,10}(?i:are|can(?:not)?|in|is|must|was)";
142             my $pseudosign_chatter_
143             = "(?:(?:the$blank+(?:$the_notname|$the_sentence_)|all begin|there|you must)\\b|,? \\(?\\w\\))";
144             my $chatter
145             = "(?im:$nonsign_|$nonidentifier_|copyright$blank_or_break_$identifier_chatter(?:\\z|@\\W|[^a-zA-Z0-9@_-])|$blank*$pseudo_sign_(?:$blank_or_break_)+$pseudosign_chatter_)";
146             my $nonyears_ = '\W?(?i:year|19[xy]{2}|[xy]{4})\W?';
147              
148             my $year_ = '\b[0-9]{4}\b';
149             my $comma_spacy = "(?:$blank*,$blank_or_break_|$blank_or_break_,?$blank*)";
150             my $dash_spacy_ = "$blank*$dash(?:$blank_or_break_)*";
151              
152             my $colon_or_dash = "(?:$colons_$blank_or_break_|$blank?$dash\{1,2}$blank)";
153             my $delimiter = "(?:$colon_or_dash|$comma_spacy)";
154              
155             my $vague_year_ = "(?:$dash$blank?)?[0-9]{1,5}";
156             my $owner_intro_
157             = "(?:$colon_or_dash|$pseudo_sign_$blank?|\\bby$blank_or_break_)";
158             my $owner_prefix = '[(*<@\[{]';
159             my $owner_initial = '[^\s!"#$%&\'()*+,./:;<=>?@[\\\\\]^_`{|}~-]';
160              
161             my $signs
162             = "(?m:(?:$label|$sign|$nroff_sign_|(?:^|$blank)$pseudo_sign_)(?:$colon_or_dash?$blank*(?:$label|$sign|$pseudo_sign_))*)";
163             my $yearspan_ = "$year_(?:$dash_spacy_$year_)?";
164             my $years_ = "$yearspan_(?:$comma_spacy$yearspan_)*";
165             my $owners_
166             = "(?:$vague_year_|$owner_prefix*$owner_initial\\S*)(?:$blank*\\S+)*";
167              
168             # compile regexps in isolation to limit use of RE2 engine
169             my ($dash_spacy_re, $owner_intro_A_re, $boilerplate_X_re,
170             $signs_and_more_re
171             );
172             {
173 7 50   7   11374 BEGIN { re::engine::RE2->import( -strict => 1 ) if ($CAN_RE2) }
174             $dash_spacy_re = qr/$dash_spacy_/;
175             $owner_intro_A_re = qr/^$owner_intro_/;
176             $boilerplate_X_re
177             = qr/(?i)${comma_spacy}All$blank+Rights$blank+Reserved[.!]?.*/;
178             $signs_and_more_re
179             = qr/$chatter|$signs(?:$blank$vague_sign_)?$delimiter(?:$broken_sign_)?(?:$nonyears_|((?:$years_$delimiter)?(?:(?:$owner_intro_)?$owners_)?))|\n/;
180             }
181              
182             sub _generate_copyright
183             {
184 13     13   2275 my ( $class, $name, $args, $globals ) = @_;
185              
186             return sub {
187 184     184   46519 my $copyright = shift;
188              
189             Carp::croak("String::Copyright strings require defined parts")
190 184 50       442 unless 1 + @_ == grep {defined} $copyright, @_;
  184         794  
191              
192             # String::Copyright objects are effectively immutable and can be reused
193 184 50 33     874 if ( !@_ && ref($copyright) eq __PACKAGE__ ) {
194 0         0 return $copyright;
195             }
196              
197             # stringify objects
198 184         360 $copyright = "$copyright";
199              
200             # TODO: also parse @_ - but each separately!
201 184         308 my @block;
202 184         298 my $skipped = 0;
203 184         4332 while ( $copyright =~ /$signs_and_more_re/g ) {
204 267         753 my $owners = $1;
205 267 100 66     1025 if ( $globals->{threshold_before} || $globals->{threshold} ) {
206             last
207             if (
208             !@block
209             and !length $owners
210             and ++$skipped >= (
211             $globals->{threshold_before} || $globals->{threshold}
212             )
213 24 100 100     76 );
      33        
      100        
214             }
215 265 100 66     845 if ( $globals->{threshold_after} || $globals->{threshold} ) {
216              
217             # "after" detects end of _current_ line so is skewed by one
218             last
219             if (
220             @block
221             and !length $owners
222             and ++$skipped >= 1 + (
223             $globals->{threshold_after} || $globals->{threshold}
224             )
225 14 100 100     63 );
      33        
      100        
226             }
227 263 100       1989 next if ( !length $owners );
228 120         176 $skipped = 0;
229              
230 120         178 my $years;
231 120         1234 my @span = $owners =~ /\G($yearspan_)(?:$comma_spacy|\Z)/gm;
232 120 100       308 if (@span) {
233 47         127 $owners = $';
234              
235             # deduplicate
236 47         79 my @ranges;
237 47         109 for (@span) {
238 74         363 my ( $y1, $y2 ) = split /$dash_spacy_re/;
239 74 100       222 if ( !$y2 ) {
    100          
240 59         152 push @ranges, $y1;
241             }
242             elsif ( $y1 > $y2 ) {
243 1         6 push @ranges, [ $y2, $y1 ];
244             }
245             else {
246 14         54 push @ranges, [ $y1, $y2 ];
247             }
248             }
249              
250             # normalize
251             $years = join ', ',
252 47 100       272 map { $_->[0] == $_->[1] ? $_->[0] : "$_->[0]-$_->[1]" }
  51         5390  
253             Set::IntSpan->new( \@ranges )->spans;
254             }
255 120 100       390 if ($owners) {
256 114         561 $owners =~ s/$owner_intro_A_re//;
257 114         278 $owners =~ s/\s{2,}/ /g;
258 114         355 $owners =~ s/$owner_intro_A_re//;
259 114         348 $owners =~ s/$boilerplate_X_re//g;
260             }
261              
262             # split owner into owner_id and owner
263              
264 120   100     950 push @block, [ $years || undef, $owners || undef ];
      100        
265             }
266              
267             # TODO: save $skipped_lines to indicate how dirty parsing was
268              
269 184         382 my $ext_format = $globals->{format};
270             my $format
271             = $globals->{format}
272 109     109   345 ? sub { $ext_format->( $_->[0], $_->[1] ) }
273 184 100 33 25   926 : sub { join ' ', '©', $_->[0] || (), $_->[1] || () };
  25   33     130  
274              
275 184         1461 bless [ $copyright, \@block, $format ], __PACKAGE__;
276             }
277 13         159 }
278              
279             sub new
280             {
281 0     0 0 0 my ( $self, @data ) = @_;
282             Carp::croak("String::Copyright require defined, positive-length parts")
283 0 0       0 unless 1 + @_ == grep { defined && length } @data;
  0 0       0  
284              
285             # String::Copyright objects are simply stripped of their string part
286 0 0 0     0 if ( !@_ && ref($self) eq __PACKAGE__ ) {
287 0         0 return bless [ undef, $data[1] ], __PACKAGE__;
288             }
289              
290             # FIXME: properly validate data
291             Carp::croak("String::Copyright blocks must be an array of strings")
292 0 0       0 unless @_ == grep { ref eq 'ARRAY' } @data;
  0         0  
293              
294 0         0 bless [ undef, \@data ], __PACKAGE__;
295             }
296              
297             sub _compose
298             {
299 198     198   46786 my $format = $_[0]->[FORMAT];
300 198         356 join "\n", map {&$format} @{ $_[0]->[BLOCKS] };
  134         273  
  198         690  
301             }
302              
303 0     0 0   sub is_normalized { !defined $_[0]->[PLAINTEXT] }
304              
305             =head1 SEE ALSO
306              
307             =over 4
308              
309             =item *
310              
311             L
312              
313             =item *
314              
315             L
316              
317             =back
318              
319             =head1 BUGS/CAVEATS/etc
320              
321             L operates on strings, not bytes.
322             Data encoded as UTF-8, Latin1 or other formats
323             need to be decoded to strings before use.
324              
325             Only ASCII characters and B<©> (copyright sign) are directly processed.
326              
327             If copyright sign is mis-detected
328             or accents or multi-byte characters display wrong,
329             then most likely the data was not decoded into a string.
330              
331             If ranges or lists of years are not tidied,
332             then maybe it contained non-ASCII whitespace or digits.
333              
334             =head1 AUTHOR
335              
336             Jonas Smedegaard C<< >>
337              
338             =head1 COPYRIGHT AND LICENSE
339              
340             This program is based on the script "licensecheck" from the KDE SDK,
341             originally introduced by Stefan Westerfeld C<< >>.
342              
343             Copyright © 2007, 2008 Adam D. Barratt
344              
345             Copyright © 2005-2012, 2016, 2018, 2020-2021 Jonas Smedegaard
346              
347             Copyright © 2018, 2020-2021 Purism SPC
348              
349             This program is free software:
350             you can redistribute it and/or modify it
351             under the terms of the GNU Affero General Public License
352             as published by the Free Software Foundation,
353             either version 3, or (at your option) any later version.
354              
355             This program is distributed in the hope that it will be useful,
356             but WITHOUT ANY WARRANTY;
357             without even the implied warranty
358             of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
359             See the GNU Affero General Public License for more details.
360              
361             You should have received a copy
362             of the GNU Affero General Public License along with this program.
363             If not, see .
364              
365             =cut
366              
367             1;