File Coverage

blib/lib/Text/Perfide/BookSync.pm

Criterion	Covered	Total	%
statement	134	269	49.8
branch	20	78	25.6
condition	0	20	0.0
subroutine	13	19	68.4
pod	8	8	100.0
total	175	394	44.4

line	stmt	bran	cond	sub	pod	time	code
1							package Text::Perfide::BookSync;
2
3	7			7		244974	use 5.006;
	7					31
	7					356
4	7			7		43	use strict;
	7					11
	7					262
5	7			7		39	use warnings;
	7					17
	7					229
6	7			7		23493	use Data::Dumper;
	7					128000
	7					564
7	7			7		69	use List::Util qw/min/;
	7					15
	7					842
8	7			7		6541	use HTML::Auto qw/matrix h v/;
	7					320352
	7					1236
9	7			7		85	use File::Basename;
	7					15
	7					452
10	7			7		10175	use utf8::all;
	7					715293
	7					60
11
12
13							=head1 NAME
14
15							Text::Perfide::BookSync - Synchronize books in plain text format.
16
17							=head1 VERSION
18
19							Version 0.01_09
20
21							=cut
22
23							our $VERSION = '0.01_09';
24
25	7			7		38362	use base 'Exporter';
	7					19
	7					55847
26							our @EXPORT = (qw/ htmlmatrix
27							marksync
28							splitchunks
29							calchunks
30							populate
31							moreinfosecs
32							moreinfochunks
33							/);
34
35
36							=head1 SYNOPSIS
37
38							Text::Perfide::BookSync performs a structural alignment at section level of
39							books in plain text format. The books have to be previously annotated by
40							Text::Perfide::BookCleaner.
41
42							=head1 EXPORT
43
44							=head1 SUBROUTINES/METHODS
45							=cut
46
47							our($split,$mark,$noclean,$html,$rm,$num,$pfile,$dump);
48							$rm //= 0;
49
50							=head2 htmlmatrix
51
52							Generates an HTML file containing a matrix showing the matches between sections
53							of two books.
54
55							=cut
56
57							sub htmlmatrix{
58	0			0	1	0	my ($chunks,$tabsec,$options) = @_;
59	0					0	my (@lines,@cols);
60	0					0	my ($l,$c)=(0,0);
61	0					0	my $h = scalar @{$tabsec->{left}{secs}};
	0					0
62	0					0	my $w = scalar @{$tabsec->{right}{secs}};
	0					0
63
64	0					0	my $data = [ map { [(undef)x$w] } (undef)x$h ];
	0					0
65	0					0	my $more_info = { left => [], right => [] };
66
67	0					0	my $ccount = 0;
68	0					0	for my $chun (@$chunks){
69	0					0	my $ls = scalar @{$chun->{left}{secs}};
	0					0
70	0					0	my $rs = scalar @{$chun->{right}{secs}};
	0					0
71	0					0	for(my $i=0; $i<$ls; $i++){
72	0					0	push @lines, $tabsec->{left}{secs}[$l+$i]{id};
73	0					0	push @{$more_info->{left}},$tabsec->{left}{secs}[$l+$i]{title};
	0					0
74							}
75	0					0	for(my $j=0; $j<$rs; $j++){
76	0					0	push @cols, $tabsec->{right}{secs}[$c+$j]{id};
77	0					0	push @{$more_info->{right}},$tabsec->{right}{secs}[$c+$j]{title};
	0					0
78							}
79	0					0	my $style;
80	0					0	my $wcmp = _numcmp($chun->{left}{wc},$chun->{right}{wc});
81	0	0	0			0	if ($wcmp < 1.1 && $wcmp > 0.9){ $style = 'background: green'; }
	0	0	0			0
			0
			0
82	0					0	elsif ($wcmp > 1.1 && $wcmp < 1.5 or $wcmp < 0.9 && $wcmp > 0.5){ $style = 'background: yellow'; }
83	0					0	else { $style = 'background: red'; }
84
85	0					0	for(my $i=0; $i<$ls; $i++){
86	0					0	for(my $j=0; $j<$rs; $j++){
87	0					0	$data->[$l+$i][$c+$j] = {
88							v => $ccount,
89							a => { style => $style },
90							more_info => matrix(
91							[], # column titles
92							[$lines[$l+$i],$cols[$c+$j]], # line titles
93							[[ $more_info->{left}[$l+$i]],
94							[$more_info->{right}[$c+$j]]],
95							{ css => ".more_info th.empty { display:none; }\n"},
96							),
97							};
98							}
99							}
100	0					0	$l+=$ls;
101	0					0	$c+=$rs;
102	0					0	$ccount++;
103							}
104	0					0	my ($fileL,$fileR) = (basename($tabsec->{left}{file}),basename($tabsec->{right}{file}));
105
106	0					0	my $dir = $options->{dir};
107	0	0				0	if(defined($dir)) { $dir.= "/"; }
	0					0
108	0					0	else { $dir = ""; }
109
110	0	0				0	open my $html,'>',"$dir$fileL"."_$fileR.html" or die "Could not open file '$dir${fileL}_$fileR.html' for writing!";
111	0					0	my $m = matrix(\@cols,\@lines,$data);
112	0					0	print $html v(h($m));
113							}
114
115							=head2 marksync
116
117							Given two files FILEL and FILER, creates new versions of these files
118							(FILEL.sync and FILER.sync) with synchronization tags marking the
119							points where the texts synchronize.
120
121							=cut
122
123							sub marksync{
124	0			0	1	0	my ($chunks,$tabsec,$fileL,$fileR,$options) = @_;
125	0					0	my ($dirL,$dirR);
126	0					0	my $dir = $options->{dir};
127	0					0	my ($fL,$fR);
128	0	0				0	if(defined($dir)){
129	0					0	$fL = "$dir/".basename($fileL);
130	0					0	$fR = "$dir/".basename($fileR);
131							}
132							else {
133	0					0	$fL = $fileL;
134	0					0	$fR = $fileR;
135							}
136
137	0	0				0	open my $syncl,'>',"$fL.sync" or die "Could not open file '$fileL.sync'";
138	0	0				0	open my $syncr,'>',"$fR.sync" or die "Could not open file '$fileR.sync'";
139
140	0	0				0	push @{$options->{outlist}},"$fL.sync\t$fR.sync" if defined($options->{outfile});
	0					0
141
142							# Print first section mark
143	0		0			0	my $rm = $options->{rm} // 0;
144	0	0				0	print $syncl qq/\n/ unless $rm>0;
145	0	0				0	print $syncr qq/\n/ unless $rm>0;
146
147
148	0	0				0	open my $fhL, '<', $fL or die;
149	0	0				0	open my $fhR, '<', $fR or die;
150	0					0	my $t;
151
152							# For each chunk
153	0					0	for(my $i=$rm; $i < @$chunks; $i++){
154
155	0					0	my ($ltext,$rtext);
156
157							# Get the chunk's start offset and end offset (LEFT)
158	0					0	my $l_start = $chunks->[$i]{left}{start};
159	0					0	my $l_length = $chunks->[$i]{left}{end} - $chunks->[$i]{left}{start};
160							# Get the chunk's text (LEFT)
161	0					0	seek($fhL,0,0);
162	0					0	read($fhL,$ltext,$l_start);
163	0					0	read($fhL,$ltext,$l_length);
164
165							# Get the chunk's start offset and end offset (RIGHT)
166	0					0	my $r_start = $chunks->[$i]{right}{start};
167	0					0	my $r_length = $chunks->[$i]{right}{end} - $chunks->[$i]{right}{start};
168							# Get the chunk's text (REFT)
169	0					0	seek($fhR,0,0);
170	0					0	read($fhR,$rtext,$r_start);
171	0					0	read($fhR,$rtext,$r_length);
172
173							# if both sides start with section mark, put sync mark
174	0	0	0			0	if ($ltext =~ /^\s_sec[^:]:([^_]+)_/ and $rtext =~ /^\s_sec[^:]:([^_]+)_/){
175	0					0	$ltext =~ s/^\s_sec[^:]:([^_]+)_\n?/\n/;
176	0					0	$rtext =~ s/^\s_sec[^:]:([^_]+)_\n?/\n/;
177							}
178
179							# Clean section marks left (unless $noclean is defined)
180	0	0				0	$ltext =~ s/_sec[^:]*:([^_]+)_//g unless $options->{noclean};
181	0	0				0	$rtext =~ s/_sec[^:]*:([^_]+)_//g unless $options->{noclean};
182
183							# Print to .sync file
184	0					0	print $syncl $ltext;
185	0					0	print $syncr $rtext;
186							}
187
188	0					0	print $syncl qq{\n};
189	0					0	print $syncr qq{\n};
190
191	0					0	close $syncl;
192	0					0	close $syncr;
193							}
194
195							=head2 splitchunks
196
197							Given two files FILEL and FILER, splits them by their synchronization points,
198							storing each chunk in a file, where each FILEL.cXX matches FILER.cXX.
199
200							=cut
201
202							sub splitchunks{
203	0			0	1	0	my ($chunks,$fileL,$fileR) = @_;
204	0					0	my $ch=1;
205	0	0				0	open my $fhL, '<', $fileL or die;
206	0	0				0	open my $fhR, '<', $fileR or die;
207	0					0	for my $c (@$chunks){
208	0					0	my $id = sprintf("%.3d",int($ch));
209	0					0	my ($t,$fout);
210
211	0					0	my $l_start = $c->{left}{start};
212	0					0	my $l_length = $c->{left}{end} - $c->{left}{start};
213	0					0	seek($fhL,0,0);
214	0					0	read($fhL,$t,$l_start); # because seek only works in bytes
215	0					0	read($fhL,$t,$l_length);
216	0	0				0	open $fout, '>', "$fileL.c$id" or die;
217	0					0	print $fout $t;
218	0					0	close $fout;
219
220	0					0	my $r_start = $c->{right}{start};
221	0					0	my $r_length = $c->{right}{end} - $c->{right}{start};
222	0					0	seek($fhR,0,0);
223	0					0	read($fhR,$t,$r_start); # because seek only works in bytes
224	0					0	read($fhR,$t,$r_length);
225	0	0				0	open $fout, '>', "$fileR.c$id" or die;
226	0					0	print $fout $t;
227	0					0	close $fout;
228
229	0					0	$ch++;
230							}
231							}
232
233							=head2 calchunks
234
235							Calculates chunks for a given pair of files. A chunk is a set of consecutive
236							sections, which are grouped in order to match the corresponding chunk.
237
238							=cut
239
240							sub calchunks{
241	6			6	1	445	my ($tabsec,$fileL,$fileR,$options) = @_;
242	6					311	my $fL = basename($fileL);
243	6					130	my $fR = basename($fileR);
244
245	6					18	my $dir = $options->{dir};
246	6	50				26	if(defined($dir)) { $dir.= "/"; }
	0					0
247	6					17	else { $dir = ""; }
248
249	6	50				972	open my $secsL,'>',"$dir$fL.secs" or die "Can't open file '$dir$fL.secs' for writing!";
250	6	50				891	open my $secsR,'>',"$dir$fR.secs" or die "Can't open file '$dir$fR.secs' for writing!";
251	6	50				357	if ($options->{num}){ ## Compare only section numbers
252	0					0	map {my $x = $_->{id}; $x =~ s/.*=//; print $secsL $x,"\n"} @{$tabsec->{left}{secs}};
	0					0
	0					0
	0					0
	0					0
253	0					0	map {my $x = $_->{id}; $x =~ s/.*=//; print $secsR $x,"\n"} @{$tabsec->{right}{secs}};
	0					0
	0					0
	0					0
	0					0
254							}
255							else{
256	6					14	map {print $secsL $_->{id},"\n"} @{$tabsec->{left}{secs}};
	119					337
	6					26
257	6					15	map {print $secsR $_->{id},"\n"} @{$tabsec->{right}{secs}};
	190					505
	6					22
258							}
259
260	6					27	my ($l,$r) = (-1,-1);
261	6					31	my $diff_file = "$dir${fL}_$fR.diff";
262	6					43406	qx{diff -y "$dir$fL.secs" "$dir$fR.secs" > '$diff_file'};
263	6					61978	open my $diff,"<", "$diff_file";
264	6					906	my $chunks = [];
265
266	6					494	while(<$diff>){
267	223					826	chomp;
268	223					1215	my @a = split /\t+/;
269	223	100				2321	if ($a[1] =~ /^\s*<$/) {
		100
		100
270	33					42	$l++;
271	33					42	push @{$chunks->[-1]{left}{secs}}, $l;
	33					158
272							}
273							elsif ($a[1] =~ /^\s*>$/) {
274	104					237	$r++;
275	104					189	push @{$chunks->[-1]{right}{secs}}, $r;
	104					691
276							}
277							elsif ($a[1] =~ /^\s*[\|]$/) {
278	31					51	$l++;
279	31					44	$r++;
280	31					27	push @{$chunks->[-1]{left}{secs}}, $l;
	31					90
281	31					5663	push @{$chunks->[-1]{right}{secs}}, $r;
	31					131
282							}
283							else{
284	55					93	$l++; $r++;
	55					92
285	55	100				177	if($chunks->[-1]){
286	49					226	$chunks->[-1]{left}{end} = $tabsec->{left}{secs}[$l]{start};
287	49					163	$chunks->[-1]{right}{end} = $tabsec->{right}{secs}[$r]{start};
288							}
289	55					837	push @$chunks, {
290							left => {
291							start => $tabsec->{left}{secs}[$l]{start},
292							secs => [] },
293							right => {
294							start => $tabsec->{right}{secs}[$r]{start},
295							secs => [] }
296							};
297	55					103	push @{$chunks->[-1]{left}{secs}}, $l;
	55					216
298	55					92	push @{$chunks->[-1]{right}{secs}}, $r;
	55					288
299							}
300							}
301
302	6	50				1022	unlink("$fL.secs","$fR.secs",$diff_file) unless defined($options->{dump});
303
304	6					55	$chunks->[-1]{left}{end} = $tabsec->{left}{secs}[-1]{end};
305	6					51	$chunks->[-1]{right}{end} = $tabsec->{right}{secs}[-1]{end};
306							# if(defined($options->{dump})){
307							# open my $cf, '>', "$dir${fL}_$fR.chunks";
308							# print $cf Dumper($chunks);
309							# close $cf;
310							# }
311	6					1983	return $chunks;
312							}
313
314							=head2 populate
315
316							From a given file in which sections have been delimited with
317							Text::Perfide::BookCleaner, creates and returns a list containing information
318							about the sections of this file: id, start offset and end offset.
319
320							=cut
321
322							sub populate{
323	12			12	1	907	my ($file) = shift;
324	12					31	my (@idlist,$text);
325	12	50				914	open my $fh, '<', $file or die "Could not open file '$file'";
326	12					2413	$text = join '',<$fh>;
327
328	12					194067	push @idlist, { 'id' => 'begin', 'start' => 0, title => 'begin' };
329	12					730	while($text =~ /(_sec.:)(.?_)/g){
330	297					600397	$idlist[-1]{end} = $-[0]-1;
331	297					881153	push @idlist,{
332							'id' => $2,
333							'start' => $-[0],
334							};
335
336							# Get title
337	297					646643	my $subs = substr($text,$-[0],200);
338							# if ($subs =~ /.\n(.\|\n){5,100}?(?=\n)/pg) { $idlist[-1]->{title} = ${^MATCH}; }
339	297	100				4374	if ($subs =~ /.*\s+([^\n]{1,100})/g) { $idlist[-1]->{title} = ${^MATCH}; }
	295					10514
340	2					30	else { $idlist[-1]->{title} = '' }
341							}
342	12					44	$idlist[-1]{end} = length $text;
343	12					749	return \@idlist;
344							}
345
346							=head2 moreinfosecs
347
348							Calculates metrics on each pair of sections (length in words, ...)
349
350							=cut
351
352							sub moreinfosecs{
353	6			6	1	528	my ($tabsec,$options) = @_;
354	6					33	my ($fileL,$fileR) = ($tabsec->{left}{file},$tabsec->{right}{file});
355	6					538	open my $fhL, '<', $fileL;
356	6					637	open my $fhR, '<', $fileR;
357	6					272	my $t;
358
359	6					17	for my $sec (@{$tabsec->{left}{secs}}){
	6					29
360	119					1530	my $start = $sec->{start};
361	119					395	my $length = $sec->{end} - $sec->{start};
362	119					8631	seek($fhL,0,0);
363	119					10284	read($fhL,$t,$start); # because seek only works in bytes
364	119					1064811	read($fhL,$t,$length);
365	119					126597	$sec->{wc} = split /\s+/,$t;
366							}
367
368	6					149	for my $sec (@{$tabsec->{right}{secs}}){
	6					38
369	190					651	my $start = $sec->{start};
370	190					890	my $length = $sec->{end} - $sec->{start};
371	190					5810	seek($fhL,0,0);
372	190					9017	read($fhL,$t,$start); # because seek only works in bytes
373	190					1684866	read($fhL,$t,$length);
374	190					79658	$sec->{wc} = split /\s+/,$t;
375							}
376
377	6					27	my $dir = $options->{dir};
378	6	50				44	if(defined($dir)) { $dir.= "/"; }
	0					0
379	6					21	else { $dir = ""; }
380	6					630	my ($fL,$fR) = (basename($tabsec->{left}{file}), basename($tabsec->{right}{file}));
381
382	6	50				366	_dump2file($tabsec,"$dir${fL}_$fR.tabsec") if ($options->{dump});
383							}
384
385							=head2 moreinfochunks
386
387							Calculates metrics on each pair of chunks (length in words, ...)
388
389							=cut
390
391							sub moreinfochunks{
392	6			6	1	288	my ($chunks,$tabsec,$options) = @_;
393	6					46	for my $chun (@$chunks){
394	55					75	my $sum;
395	55					87	map { $sum+= $tabsec->{left}{secs}[$_]{wc} } @{$chun->{left}{secs}};
	119					255
	55					117
396	55					114	$chun->{left}{wc} = $sum;
397	55					155	$sum=0;
398	55					84	map { $sum+= $tabsec->{right}{secs}[$_]{wc} } @{$chun->{right}{secs}};
	190					564
	55					104
399	55					135	$chun->{right}{wc} = $sum;
400							}
401	6					38	my $dir = $options->{dir};
402	6	50				46	if(defined($dir)) { $dir.= "/"; }
	0					0
403	6					40	else { $dir = ""; }
404	6					961	my ($fL,$fR) = (basename($tabsec->{left}{file}), basename($tabsec->{right}{file}));
405
406	6	50				64	_dump2file($chunks,"$dir${fL}_$fR.chunks") if ($options->{dump});
407							}
408
409							=head2 load_localrc
410							=cut
411
412							sub load_localrc {
413	0			0	1		my $localrc = shift;
414	0	0					die "Could not find configuration file '$localrc'" unless -e $localrc;
415	0						return do $localrc;
416							}
417
418							sub _dump2file {
419	0			0			my ($ref,$filename) = @_;
420	0	0					open my $fn, '>', $filename or die "Could not open file '$filename' for writing.";
421	0						print $fn Dumper($ref);
422	0						close $fn;
423							}
424
425							sub _numcmp{
426	0			0			my ($a,$b) = @_;
427	0	0					return _numcmp($b,$a) if ($a>$b);
428	0	0	0				return 1 if ($a==0 and $b==0);
429	0	0					return 10 if ($b==0);
430	0						return $a/$b;
431							}
432
433							=head1 AUTHOR
434
435							Andre Santos, C<< >>
436							Jose Joao Almeida, C<< >>
437
438							=head1 BUGS
439
440							Please report any bugs or feature requests to C, or through
441							the web interface at L. I will be notified, and then you'll
442							automatically be notified of progress on your bug as I make changes.
443
444							=head1 ACKNOWLEDGEMENTS
445
446
447							=head1 LICENSE AND COPYRIGHT
448
449							Copyright 2011 Project Natura.
450
451							This program is free software; you can redistribute it and/or modify it
452							under the terms of either: the GNU General Public License as published
453							by the Free Software Foundation; or the Artistic License.
454
455							See http://dev.perl.org/licenses/ for more information.
456
457
458							=cut
459
460							1; # End of Text::Perfide::BookSync