| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
## -*- Mode: CPerl -*- |
|
2
|
|
|
|
|
|
|
## File: DiaColloDB::Document::DDCTabs.pm |
|
3
|
|
|
|
|
|
|
## Author: Bryan Jurish <moocow@cpan.org> |
|
4
|
|
|
|
|
|
|
## Description: collocation db, source document, DDC tab-dump |
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package DiaColloDB::Document::DDCTabs; |
|
7
|
1
|
|
|
1
|
|
7
|
use DiaColloDB::Document; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
30
|
|
|
8
|
1
|
|
|
1
|
|
8
|
use IO::File; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
57
|
|
|
9
|
1
|
|
|
1
|
|
268
|
use strict; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
1220
|
|
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
##============================================================================== |
|
12
|
|
|
|
|
|
|
## Globals & Constants |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
our @ISA = qw(DiaColloDB::Document); |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
##============================================================================== |
|
17
|
|
|
|
|
|
|
## Constructors etc. |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
## $doc = CLASS_OR_OBJECT->new(%args) |
|
20
|
|
|
|
|
|
|
## + %args, object structure: |
|
21
|
|
|
|
|
|
|
## ( |
|
22
|
|
|
|
|
|
|
## ##-- parsing options |
|
23
|
|
|
|
|
|
|
## eosre => $re, ##-- EOS regex (empty or undef for file-breaks only; default='^$') |
|
24
|
|
|
|
|
|
|
## utf8 => $bool, ##-- enable utf8 parsing? (default=1) |
|
25
|
|
|
|
|
|
|
## trimPND => $bool, ##-- create trimmed "pnd" meta-attribute? (default=1) |
|
26
|
|
|
|
|
|
|
## trimAuthor => $bool, ##-- trim "author" meta-attribute (eliminate DTA PNDs)? (default=1) |
|
27
|
|
|
|
|
|
|
## trimGenre => $bool, ##-- create trimmed "genre" meta-attribute? (default=1) |
|
28
|
|
|
|
|
|
|
## foreign => $bool, ##-- alias for trimAuthor=0 trimPND=0 trimGenre=0 |
|
29
|
|
|
|
|
|
|
## ## |
|
30
|
|
|
|
|
|
|
## ##-- document data |
|
31
|
|
|
|
|
|
|
## date =>$date, ##-- year |
|
32
|
|
|
|
|
|
|
## wf =>$iw, ##-- index-field for $word attribute (default=0) |
|
33
|
|
|
|
|
|
|
## pf =>$ip, ##-- index-field for $pos attribute (default=1) |
|
34
|
|
|
|
|
|
|
## lf =>$il, ##-- index-field for $lemma attribute (default=2) |
|
35
|
|
|
|
|
|
|
## pagef =>$ipage, ##-- index-field for $page attribute (default=undef:none) |
|
36
|
|
|
|
|
|
|
## tokens =>\@tokens, ##-- tokens, including undef for EOS |
|
37
|
|
|
|
|
|
|
## meta =>\%meta, ##-- document metadata (e.g. author, title, collection, ...) |
|
38
|
|
|
|
|
|
|
## ## + may also generate special $meta->{genre} as 1st component of $meta->{textClass} if available |
|
39
|
|
|
|
|
|
|
## ) |
|
40
|
|
|
|
|
|
|
## + each token in @tokens is a HASH-ref {w=>$word,p=>$pos,l=>$lemma,...} |
|
41
|
|
|
|
|
|
|
## + default attribute positions ($iw,$ip,$il,$ipage) are overridden doc lines '%%$DDC:index[INDEX]=LONGNAME w' etc if present |
|
42
|
|
|
|
|
|
|
sub new { |
|
43
|
0
|
|
|
0
|
1
|
|
my $that = shift; |
|
44
|
0
|
|
|
|
|
|
my $doc = $that->SUPER::new( |
|
45
|
|
|
|
|
|
|
utf8=>1, |
|
46
|
|
|
|
|
|
|
trimPND=>1, |
|
47
|
|
|
|
|
|
|
trimAuthor=>1, |
|
48
|
|
|
|
|
|
|
trimGenre=>1, |
|
49
|
|
|
|
|
|
|
eosre=>qr{^$}, |
|
50
|
|
|
|
|
|
|
wf=>0, |
|
51
|
|
|
|
|
|
|
pf=>1, |
|
52
|
|
|
|
|
|
|
lf=>2, |
|
53
|
|
|
|
|
|
|
pagef=>undef, |
|
54
|
|
|
|
|
|
|
@_, ##-- user arguments |
|
55
|
|
|
|
|
|
|
); |
|
56
|
0
|
|
|
|
|
|
return $doc; |
|
57
|
|
|
|
|
|
|
} |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
##============================================================================== |
|
60
|
|
|
|
|
|
|
## API: I/O |
|
61
|
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
## $ext = $doc->extension() |
|
63
|
|
|
|
|
|
|
## + default extension, for Corpus::Compiled |
|
64
|
|
|
|
|
|
|
sub extension { |
|
65
|
0
|
|
|
0
|
0
|
|
return '.tabs'; |
|
66
|
|
|
|
|
|
|
} |
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
|
69
|
|
|
|
|
|
|
## API: I/O: parse |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
## $bool = $doc->fromFile($filename_or_fh, %opts) |
|
72
|
|
|
|
|
|
|
## + parse tokens from $filename_or_fh |
|
73
|
|
|
|
|
|
|
## + %opts : clobbers %$doc |
|
74
|
|
|
|
|
|
|
sub fromFile { |
|
75
|
0
|
|
|
0
|
1
|
|
my ($doc,$file,%opts) = @_; |
|
76
|
0
|
0
|
|
|
|
|
$doc = $doc->new() if (!ref($doc)); |
|
77
|
0
|
|
|
|
|
|
@$doc{keys %opts} = values %opts; |
|
78
|
0
|
0
|
|
|
|
|
$doc->{label} = ref($file) ? "$file" : $file; |
|
79
|
0
|
0
|
|
|
|
|
my $fh = ref($file) ? $file : IO::File->new("<$file"); |
|
80
|
0
|
0
|
|
|
|
|
$doc->logconfess("fromFile(): cannot open file '$file': $!") if (!ref($fh)); |
|
81
|
0
|
0
|
|
|
|
|
binmode($fh,':utf8') if ($doc->{utf8}); |
|
82
|
|
|
|
|
|
|
|
|
83
|
0
|
|
0
|
|
|
|
my ($wf,$pf,$lf,$pagef) = map {($_//-1)} @$doc{qw(wf pf lf pagef)}; |
|
|
0
|
|
|
|
|
|
|
|
84
|
0
|
|
|
|
|
|
my $tokens = $doc->{tokens}; |
|
85
|
0
|
|
|
|
|
|
@$tokens = qw(); |
|
86
|
0
|
|
|
|
|
|
my $meta = $doc->{meta}; |
|
87
|
0
|
|
|
|
|
|
%$meta = qw(); |
|
88
|
0
|
|
|
|
|
|
my $eos = undef; |
|
89
|
0
|
|
|
|
|
|
my $eosre = $doc->{eosre}; |
|
90
|
0
|
0
|
0
|
|
|
|
$eosre = qr{$eosre} if ($eosre && !ref($eosre)); |
|
91
|
0
|
|
|
|
|
|
my $last_was_eos = 1; |
|
92
|
0
|
|
|
|
|
|
my $is_eos = 0; |
|
93
|
0
|
|
|
|
|
|
my $curpage = ''; |
|
94
|
0
|
|
|
|
|
|
my ($w,$p,$l,$page); |
|
95
|
0
|
|
|
|
|
|
while (defined($_=<$fh>)) { |
|
96
|
0
|
|
|
|
|
|
chomp; |
|
97
|
0
|
0
|
0
|
|
|
|
if (/^%%/) { |
|
|
|
0
|
|
|
|
|
|
|
98
|
0
|
0
|
|
|
|
|
if (/^%%(?:\$DDC:meta\.date_|\$?date)=([0-9]+)/) { |
|
99
|
0
|
|
|
|
|
|
$doc->{date} = $1; |
|
100
|
|
|
|
|
|
|
} |
|
101
|
0
|
0
|
0
|
|
|
|
if (/^%%\$DDC:meta\.([^=]+)=(.*)$/) { |
|
|
|
0
|
0
|
|
|
|
|
|
|
|
0
|
0
|
|
|
|
|
|
|
|
0
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
102
|
0
|
|
|
|
|
|
$meta->{$1} = $2; |
|
103
|
|
|
|
|
|
|
} |
|
104
|
|
|
|
|
|
|
elsif (/^%%\$DDC:index\[([0-9]+)\]=Token\b/ || /^%%\$DDC:index\[([0-9]+)\]=\S+ w$/) { |
|
105
|
0
|
|
|
|
|
|
$wf = $doc->{wf} = $1; |
|
106
|
|
|
|
|
|
|
} |
|
107
|
|
|
|
|
|
|
elsif (/^%%\$DDC:index\[([0-9]+)\]=Pos\b/ || /^%%\$DDC:index\[([0-9]+)\]=\S+ p$/) { |
|
108
|
0
|
|
|
|
|
|
$pf = $doc->{pf} = $1; |
|
109
|
|
|
|
|
|
|
} |
|
110
|
|
|
|
|
|
|
elsif (/^%%\$DDC:index\[([0-9]+)\]=Lemma\b/ || /^%%\$DDC:index\[([0-9]+)\]=\S+ l$/) { |
|
111
|
0
|
|
|
|
|
|
$lf = $doc->{lf} = $1; |
|
112
|
|
|
|
|
|
|
} |
|
113
|
|
|
|
|
|
|
elsif (/^%%\$DDC:index\[([0-9]+)\]=Pos\b/ || /^%%\$DDC:index\[([0-9]+)\]=\S+ page$/) { |
|
114
|
0
|
|
|
|
|
|
$pagef = $doc->{pagef} = $1; |
|
115
|
|
|
|
|
|
|
} |
|
116
|
|
|
|
|
|
|
elsif (/^%%\$DDC:BREAK.([^=\[\]]+)/) { |
|
117
|
0
|
|
|
|
|
|
push(@$tokens,"#$1"); |
|
118
|
|
|
|
|
|
|
} |
|
119
|
|
|
|
|
|
|
elsif (/^%%\$DDC:PAGE=/) { |
|
120
|
0
|
|
|
|
|
|
push(@$tokens,"#page"); |
|
121
|
|
|
|
|
|
|
} |
|
122
|
0
|
0
|
0
|
|
|
|
if ($eosre && $_ =~ $eosre) { |
|
123
|
0
|
0
|
|
|
|
|
push(@$tokens,$eos) if (!$last_was_eos); |
|
124
|
0
|
|
|
|
|
|
$last_was_eos = 1; |
|
125
|
|
|
|
|
|
|
} |
|
126
|
0
|
|
|
|
|
|
next; |
|
127
|
|
|
|
|
|
|
} |
|
128
|
|
|
|
|
|
|
elsif ($eosre && $_ =~ $eosre) { |
|
129
|
0
|
0
|
|
|
|
|
push(@$tokens,$eos) if (!$last_was_eos); |
|
130
|
0
|
|
|
|
|
|
$last_was_eos = 1; |
|
131
|
0
|
|
|
|
|
|
next; |
|
132
|
|
|
|
|
|
|
} |
|
133
|
0
|
|
|
|
|
|
($w,$p,$l,$page) = (split(/\t/,$_))[$wf,$pf,$lf,$pagef]; |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
##-- honor dta-style $page index |
|
136
|
0
|
0
|
0
|
|
|
|
if ($pagef > 0 && $page ne $curpage) { |
|
137
|
0
|
|
|
|
|
|
push(@$tokens, "#page"); |
|
138
|
0
|
|
|
|
|
|
$curpage = $page; |
|
139
|
|
|
|
|
|
|
} |
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
##-- add token |
|
142
|
0
|
|
0
|
|
|
|
push(@$tokens, {w=>($w//''), p=>($p//''), l=>($l//'')}); |
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
143
|
0
|
|
|
|
|
|
$last_was_eos = 0; |
|
144
|
|
|
|
|
|
|
} |
|
145
|
0
|
0
|
|
|
|
|
push(@$tokens,$eos) if (!$last_was_eos); |
|
146
|
|
|
|
|
|
|
|
|
147
|
0
|
0
|
|
|
|
|
if (!$doc->{foreign}) { |
|
148
|
|
|
|
|
|
|
##-- hack: compute top-level $meta->{genre} from $meta->{textClass} if requested |
|
149
|
0
|
|
0
|
|
|
|
$meta->{genre} //= $meta->{textClass}; |
|
150
|
|
|
|
|
|
|
$meta->{genre} =~ s/\:.*$// |
|
151
|
0
|
0
|
0
|
|
|
|
if ($doc->{trimGenre} && defined($meta->{genre})); |
|
152
|
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
##-- hack: compute/trim top-level $meta->{pnd} if requested |
|
154
|
0
|
|
0
|
|
|
|
$meta->{pnd} //= $meta->{author}; |
|
155
|
0
|
0
|
0
|
|
|
|
if ($doc->{trimPND} && defined($meta->{pnd})) { |
|
156
|
0
|
|
|
|
|
|
$meta->{pnd} = join(' ', ($meta->{pnd} =~ m/\#[0-9a-zA-Z]+/g)); |
|
157
|
0
|
0
|
0
|
|
|
|
delete($meta->{pnd}) if (($meta->{pnd}//'') eq ''); |
|
158
|
|
|
|
|
|
|
} |
|
159
|
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
##-- hack: trim top-level $meta->{author} if requested |
|
161
|
|
|
|
|
|
|
$meta->{author} =~ s/\s*\([^\)]*\)$// |
|
162
|
0
|
0
|
0
|
|
|
|
if ($doc->{trimAuthor} && defined($meta->{author})); |
|
163
|
|
|
|
|
|
|
} |
|
164
|
|
|
|
|
|
|
|
|
165
|
0
|
0
|
|
|
|
|
$fh->close() if (!ref($file)); |
|
166
|
0
|
|
|
|
|
|
return $doc; |
|
167
|
|
|
|
|
|
|
} |
|
168
|
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
##============================================================================== |
|
170
|
|
|
|
|
|
|
## Footer |
|
171
|
|
|
|
|
|
|
1; |
|
172
|
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
__END__ |
|
174
|
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
|