File Coverage

blib/lib/Lingua/ZH/Summarize.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1             # $File: //member/autrijus/Lingua-ZH-Summarize/Summarize.pm $ $Author: autrijus $
2             # $Revision: #2 $ $Change: 3687 $ $DateTime: 2003/01/20 07:22:40 $
3              
4             package Lingua::ZH::Summarize;
5             $Lingua::ZH::Summarize::VERSION = '0.01';
6              
7 1     1   649 use strict;
  1         2  
  1         44  
8 1     1   5 use vars qw($VERSION @ISA @EXPORT);
  1         3  
  1         65  
9              
10 1     1   5 use Exporter;
  1         2  
  1         42  
11 1     1   744 use Lingua::ZH::Toke;
  0            
  0            
12              
13             =head1 NAME
14              
15             Lingua::ZH::Summarize - Summarizing bodies of Chinese text
16              
17             =head1 SYNOPSIS
18              
19             use Lingua::ZH::Summarize;
20              
21             print summarize( $text ); # Easy, no? :-)
22             print summarize( $text, maxlength => 500 ); # 500-byte summary
23             print summarize( $text, wrap => 75 ); # Wrap output to 75 col.
24              
25             =head1 DESCRIPTION
26              
27             This is a simple module which makes an unscientific effort at
28             summarizing Chinese text. It recognizes simple patterns which look like
29             statements, abridges them, and concatenates them into something vaguely
30             resembling a summary. It needs more work on large bodies of text, but
31             it seems to have a decent effect on small inputs at the moment.
32              
33             Lingua::ZH::Summarize exports one function, C, which takes
34             the text to summarize as its first argument, and any number of optional
35             directives in C value> form. The options it'll take are:
36              
37             =over
38              
39             =item maxlength
40              
41             Specifies the maximum length, in bytes, of the generated summary.
42              
43             =item wrap
44              
45             Prettyprints the summary output by wrapping it to the number of columns
46             which you specify. This requires the L module.
47              
48             =back
49              
50             Needless to say, this is a very simple and not terribly universally
51             effective scheme, but it's good enough for a first draft, and I'll bang
52             on it more later. Like I said, it's not a scientific approach to the
53             problem, but it's better than nothing.
54              
55             =cut
56              
57             @ISA = qw(Exporter);
58             @EXPORT = qw(summarize);
59              
60             my %punct = map { $_ => $_ } qw(。 ? ! ; ...);
61             $punct{$_} = '。' for qw(, -);
62              
63             my %key = map { $_ => 1 } qw(是 會 曾 將 能 有);
64             my %stop = (
65             %key, map { $_ => 1 } qw(
66             的 裡 和 與 及 年 月 日 時 分 秒 可 對 於 但 也 且 或 中 而 為 叫
67             )
68             );
69              
70             my $is_utf8;
71             sub import {
72             my $class = shift;
73              
74             if ($_[0] eq 'utf8' and !$is_utf8++) {
75             Lingua::ZH::Toke->import(@_);
76              
77             require Encode;
78             %key = map { Encode::decode( big5 => $_ ) => 1 } keys %key;
79             %stop = map { Encode::decode( big5 => $_ ) => 1 } keys %stop;
80             %punct = map {
81             Encode::decode( big5 => $_ ) =>
82             Encode::decode( big5 => $punct{$_} )
83             } keys %punct;
84             }
85             }
86              
87             sub summarize {
88             my ($text, %options) = @_;
89              
90             # Run each filter over the text.
91             return unless $text;
92              
93             # Strip whitespace and formatting out of the text.
94             $text =~ s/^\s+//;
95             $text =~ s/\s+/ /sg;
96             $text =~ s/\s+$//;
97              
98             unless (exists $options{maxlength} and $options{maxlength} > 0) {
99             $options{maxlength} = log(length($text)) * 30;
100             }
101              
102             ### Here's where the interesting logic happens.
103              
104             my $sentence = Lingua::ZH::Toke->new($text);
105              
106             # First we break it into sentence pieces. Kind of. Sort of.
107              
108             my $summary = '';
109             my $flag = 1;
110              
111             <$sentence> unless $sentence->[0][0];
112              
113             while (my ($chunk, $punct) = (scalar <$sentence>, scalar <$sentence>)) {
114             ($flag = $punct{$punct}, next) unless $flag;
115             $flag = $punct{$punct} or next;
116              
117             next unless length($chunk) > 10;
118             next unless $chunk =~ /.+(?:\Q是\E|\Q會\E|\Q曾\E|\Q將\E|\Q能\E|\Q有\E)/;
119             next if $stop{substr($chunk, 0, 2)} or $stop{substr($chunk, -2)};
120              
121             $summary .= $chunk . $punct{$punct};
122              
123             last if length($summary) >= $options{maxlength};
124             }
125              
126             ### Done! Do any necessary postprocessing before returning.
127              
128             return $summary unless $options{wrap};
129              
130             # Prettyprint the summary to make it look nice on a terminal, if requested.
131              
132             require Lingua::ZH::Wrap;
133              
134             $summary = Encode::encode(big5 => $summary) if $is_utf8;
135             $summary = Lingua::ZH::Wrap::wrap(
136             $summary, $options{wrap} || 72, 1
137             );
138             $summary = Encode::decode(big5 => $summary) if $is_utf8;
139              
140             return $summary;
141             }
142              
143             1;
144              
145             =head1 SEE ALSO
146              
147             L, L, L
148              
149             =head1 ACKNOWLEDGEMENTS
150              
151             Algorithm adapted from the L module by
152             Dennis Taylor, Edennis@funkplanet.comE.
153              
154             =head1 AUTHORS
155              
156             Autrijus Tang Eautrijus@autrijus.orgE
157              
158             =head1 COPYRIGHT
159              
160             Copyright 2003 by Autrijus Tang Eautrijus@autrijus.orgE.
161              
162             This program is free software; you can redistribute it and/or modify it
163             under the same terms as Perl itself.
164              
165             See L
166              
167             =cut