line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package WWW::BookBot::Chinese; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
755
|
use 5.008; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
38
|
|
4
|
1
|
|
|
1
|
|
6
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
31
|
|
5
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
35
|
|
6
|
1
|
|
|
1
|
|
5
|
no warnings qw(uninitialized); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
48
|
|
7
|
1
|
|
|
1
|
|
5
|
use base qw(WWW::BookBot); |
|
1
|
|
|
|
|
7
|
|
|
1
|
|
|
|
|
808
|
|
8
|
1
|
|
|
1
|
|
13
|
use vars qw($VERSION); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
764
|
|
9
|
|
|
|
|
|
|
$VERSION = '0.12'; |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
#------------------------------------------------------------- |
12
|
|
|
|
|
|
|
# Default settings |
13
|
|
|
|
|
|
|
# $class->default_settings => \%settings |
14
|
|
|
|
|
|
|
#------------------------------------------------------------- |
15
|
|
|
|
|
|
|
sub default_settings {
    # $class->default_settings => \%settings
    #
    # Takes the settings produced by the parent class and overrides the three
    # language-related entries with the values used by this subclass.
    # Returns the same settings reference the parent returned.
    my ($invocant) = @_;

    my $settings = $invocant->SUPER::default_settings;

    # Assigned in the same order as before: get_language, then the decode
    # and encode charsets.
    @{$settings}{qw(get_language language_decode language_encode)}
        = ('zh-cn', 'gbk', 'gbk');

    return $settings;
}
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
#------------------------------------------------------------- |
24
|
|
|
|
|
|
|
# Redefined functions |
25
|
|
|
|
|
|
|
# $bot->decode_entity($content_dein_deout) => N/A |
26
|
|
|
|
|
|
|
# $bot->trandict_init => $bot->{translate_dict} |
27
|
|
|
|
|
|
|
# $bot->msg_init => $bot->{messages} |
28
|
|
|
|
|
|
|
#------------------------------------------------------------- |
29
|
|
|
|
|
|
|
sub decode_entity {
    # $bot->decode_entity($content) => N/A
    #
    # Decodes HTML character references inside the caller's $content.  The
    # text is edited IN PLACE through the @_ alias ($_[1]); the return value
    # is just the result of the last substitution and carries no meaning.
    #
    # Steps, in order:
    #   1. decimal references      &#20013;
    #   2. hexadecimal references  &#x4E2D;
    #   3. named references, looked up in %WWW::BookBot::entity2char
    #   4. U+2022 (bullet) is normalized to U+00B7 (middle dot)
    #
    # Chinese novels sometimes add \x{FF1B} (fullwidth semicolon) after an
    # unknown unicode string, so an optional \x{FF1B} following a numeric
    # reference is swallowed together with it.
    #
    # Numeric references accept up to 7 decimal / 6 hex digits so that the
    # whole Unicode range (up to U+10FFFF) is reachable; shorter limits cut
    # references such as &#128512; in the middle and left stray digits.
    # [0-9] is used instead of \d because \d also matches non-ASCII digits,
    # which chr() would numify to 0.  A value above 0x10FFFF is not a valid
    # code point, so such a reference is left untouched.

    # 1. decimal: $1 is the whole reference, $2 the digits
    $_[1]=~s/(&\#([0-9]{1,7});?\x{FF1B}?)/$2 <= 0x10FFFF ? chr($2) : $1/esg;

    # 2. hexadecimal: $1 is the whole reference, $2 the hex digits
    $_[1]=~s/(&\#[xX]([0-9a-fA-F]{1,6});?\x{FF1B}?)/hex($2) <= 0x10FFFF ? chr(hex($2)) : $1/esg;

    # 3. named: an unknown name (or one mapping to a false value) keeps the
    #    original text
    $_[1]=~s/(&([0-9a-zA-Z]{1,9});?)/$WWW::BookBot::entity2char{$2} or $1/esg;

    #normalize middle dot
    $_[1]=~s/\x{2022}/\x{00B7}/sg;
}
37
|
|
|
|
|
|
|
sub trandict_init {
    # $bot->trandict_init => $bot->{translate_dict}
    #
    # Stores a fresh lookup table on the bot that maps the keys 'log',
    # 'result', 'DB' and 'debug' to their Chinese labels.  The message
    # templates in this module index this table by $pargs->{filetype}.
    # Returns the hash reference that was stored.
    my ($bot) = @_;

    my %label_for = (
        'log'    => "日志",
        'result' => "结果",
        'DB'     => "数据",
        'debug'  => "调试",
    );

    return $bot->{translate_dict} = \%label_for;
}
45
|
|
|
|
|
|
|
sub msg_init {
    # $bot->msg_init => $bot->{messages}
    #
    # Installs the Chinese message templates on the bot and returns the
    # stored hash reference.
    #
    # Every template is built from single-quoted pieces (or <<'DATA'
    # heredocs), so the $pargs->{...} / $self->{...} fragments and the "\$"
    # sequences are stored as literal text here.
    # NOTE(review): presumably the base class expands them later -- the code
    # that does so is not visible in this file; confirm there.
    my ($bot) = @_;

    # Common tail of the "skipped" / "cannot analyse" messages: a newline,
    # the indentation placeholder, the URL placeholder and a final newline.
    my $skip_info = "\n" . '$pargs->{levelspace} url=$pargs->{url}' . "\n";

    my %template_for = (
        TestMsg         => '测试: $pargs->{TestInfo} $pargs->{TestNum}',
        BookStart       => '$pargs->{levelspace} [$pargs->{bpos_limit}/$pargs->{book_num}] $pargs->{title_limit} ',
        BookBinaryOK    => '$pargs->{data_len_KB} $pargs->{write_file}' . "\n",
        BookChapterErr  => ' - 无法分析' . $skip_info,
        BookChapterMany => '[$pargs->{chapter_num_limit}章]',
        BookChapterOne  => '[单章节]',
        BookChapterOK   => '$pargs->{data_len_KB}' . "\n",
        BookTOCFinish   => '$pargs->{TOC_len_KB}' . "\n",
        CatalogInfo     => '取书目: ',
        CatalogResultErr => ' 0套书' . "\n",
        CatalogResultOK => ' $pargs->{book_num}套书' . "\n",
        CatalogURL      => '$pargs->{url}',
        CatalogURLEmpty => '[失败] 索引的URL为空' . "\n",

        # Lines written to the generated data file: failed items are emitted
        # as live calls, successful ones as commented-out calls.
        DBBookErr       => "\t" . ' \$bot->go_book({$pargs->{allargs}});' . "\t#错误\n",
        DBBookOK        => "\t" . '#\$bot->go_book({$pargs->{allargs}});' . "\n",
        DBCatalogErr    => ' \$bot->go_catalog({$pargs->{allargs}});' . "\t#错误\n",
        DBCatalogOK     => '#\$bot->go_catalog({$pargs->{allargs}});' . "\n",

        # Header of the generated data file.
        DBHead          => <<'DATA',
#!$pargs->{perlcmd}
##======================================
## 自动生成的数据文件,用于$pargs->{classname}
## 生成时间: $pargs->{createtime}
##======================================

use $pargs->{classname};
my \$bot = new $pargs->{classname};

DATA

        # File-system failures; the file kind is looked up in translate_dict.
        FailClearDB     => '无法清除数据文件$pargs->{filename}: $pargs->{errmsg}',
        FailClose       => '无法关闭$self->{translate_dict}->{$pargs->{filetype}}文件$pargs->{filename}: $pargs->{errmsg}',
        FailMkDir       => '建目录$pargs->{dir}失败: $pargs->{errmsg}',
        FailOpen        => '无法打开$self->{translate_dict}->{$pargs->{filetype}}文件$pargs->{filename}: $pargs->{errmsg}',
        FailWrite       => '无法写入$self->{translate_dict}->{$pargs->{filetype}}文件$pargs->{filename}: $pargs->{errmsg}',

        # Download failures, each in a short and a detailed variant.
        GetFail404      => <<'DATA',
[$pargs->{code},失败] 找不到文件
$pargs->{url_real}
DATA
        GetFail404Detail => <<'DATA',
[$pargs->{code},失败] 找不到文件
>>>>请求
$pargs->{req_content}<<<<响应
$pargs->{status_line}

DATA
        GetFailRetries  => <<'DATA',
[$pargs->{code},失败] 重试太多,放弃
$pargs->{url_real}
DATA
        GetFailRetriesDetail => <<'DATA',
[$pargs->{code},失败] 重试太多,放弃
>>>>请求
$pargs->{req_content}<<<<响应
$pargs->{status_line}
$pargs->{res_content}

DATA

        GetURLSuccess   => '$pargs->{len_KB} ',
        GetURLRetry     => '[$pargs->{code},重试] ',
        GetWait         => '等待..',

        # Reasons for skipping an item.
        SkipMaxLevel    => '[跳过]层数>$self->{book_max_levels}' . $skip_info,
        SkipMedia       => '[跳过]媒体文件' . $skip_info,
        SkipTitleEmpty  => '[跳过]标题为空' . $skip_info,
        SkipUrlEmpty    => '[跳过]地址为空' . "\n",
        SkipVisited     => '[跳过]已访问过' . "\n",
        SkipZip         => '[跳过]压缩文件' . $skip_info,
    );

    return $bot->{messages} = \%template_for;
}
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
#------------------------------------------------------------- |
117
|
|
|
|
|
|
|
# patterns |
118
|
|
|
|
|
|
|
#------------------------------------------------------------- |
119
|
|
|
|
|
|
|
sub getpattern_space2_data {
    # Returns the pattern source text for the "space2" pattern: a single
    # bracketed character class followed by a newline.
    # NOTE(review): the character inside the brackets shows up as one plain
    # space in this view; confirm against the original file that it is not
    # a different (e.g. ideographic) space character.
    my $space_class = <<'DATA';
[ ]
DATA
    return $space_class;
}
124
|
|
|
|
|
|
|
sub getpattern_line_head_data {
    # Returns the "line head" string: a one-character string with no
    # trailing newline.
    # NOTE(review): the character shows up as one plain space in this view;
    # confirm against the original file that it is not a different
    # (e.g. ideographic) space character.
    my $line_head = ' ';
    return $line_head;
}
127
|
|
|
|
|
|
|
sub getpattern_parentheses_data {
    # Returns the parent class's parentheses data with this module's own
    # list appended.  Each appended line holds two tokens separated by
    # whitespace -- an opening mark and its closing mark -- covering CJK
    # quotes and brackets, ASCII quotes and brackets, and vertical /
    # small-form variants.
    # NOTE(review): several ASCII-looking pairs below may originally have
    # been fullwidth characters; verify against the original file.
    my ($invocant) = @_;

    my $inherited = $invocant->SUPER::getpattern_parentheses_data();

    my $local_pairs = <<'DATA';
〃 〃
‘ ’
“ ”
〔 〕
〈 〉
《 》
「 」
『 』
〖 〗
【 】
′ ′
″ ″
" "
' '
( )
< >
[ ]
` `
` '
{ }
︵ ︶
︹ ︺
︿ ﹀
︽ ︾
﹁ ﹂
﹃ ﹄
︻ ︼
︷ ︸
ˋ ˊ
‵ ‵
〝 〞
﹙ ﹚
﹛ ﹜
﹝ ﹞
﹤ ﹥
DATA

    return $inherited . $local_pairs;
}
166
|
|
|
|
|
|
|
sub getpattern_mark_dash_data {
    # Returns the pattern source text for "dash"-like marks: one bracketed
    # character class (symbols, dashes, box-drawing and small-form
    # characters) followed by a newline.
    # NOTE(review): some members look compatibility-normalized to ASCII in
    # this view, which introduces ranges such as "+-="; verify the literal
    # against the original file.
    my $dash_class = <<'DATA';
[#-&\*\+\-=@_~ˉ—~‖…×÷∷⊙≡≈∽∞$¤¢‰§#%&*+-=@_|–―‥∣¦‐ー─-♂〇〓※︱-︴﹉-﹏﹡﹢﹣﹦﹩﹪﹫]
DATA
    return $dash_class;
}
171
|
|
|
|
|
|
|
sub getpattern_mark_wordsplit_data {
    # Returns the pattern source text for word-splitting punctuation: one
    # bracketed character class of ASCII, CJK and small-form punctuation
    # marks, followed by a newline.
    # NOTE(review): some members may originally have been fullwidth forms;
    # verify the literal against the original file.
    my $wordsplit_class = <<'DATA';
[\.\,\?\!\:\;∶、。·!,.:;?︰﹐﹑﹒﹔﹕﹖﹗]
DATA
    return $wordsplit_class;
}
176
|
|
|
|
|
|
|
sub getpattern_word_finish_data {
    # Returns the pattern source text that recognizes an end-of-text marker:
    # an optional two-character prefix (first alternative of the group, the
    # second alternative is empty) followed by one closing character, then a
    # newline.
    my $finish_pattern = <<'DATA';
(?:全[文书]|)[完终]
DATA
    return $finish_pattern;
}
181
|
|
|
|
|
|
|
sub getpattern_remove_line_by_end_data {
    # Returns the multi-line pattern data for the "remove line by end" rule:
    # a "(case)" line followed by one regex fragment per line (publisher and
    # site names, OCR / typesetting credits, site host names), each
    # terminated by a newline.
    # NOTE(review): how the "(case)" line is interpreted is decided by the
    # consumer of this data, which is not visible here.  Some character
    # classes below contain what look like duplicated ASCII members (e.g.
    # "[OoOo]"); these were probably fullwidth forms originally -- verify
    # against the original file.
    my $line_end_patterns = <<'DATA';
(case)
[报网社讯]
[连重排整出提推扫校较编书世视文科在讨小工转][学幻论作]?(?:[载贴排版理品供出入校较描正对者屋库城路界苑线区组室]|海洋|望远镜|桃花源|-K12)(?:完成|)
请(?:申请授权|保留站台信息)[。.﹒\.!﹗]?
制作
[OoOo][CcCc][RrRr]
采编中心
亦凡公益图书馆
龙的天空
失落的星辰
书香门第
旧雨楼
一剑小天下
竹露荷风
扬剑轩居士
幻想时代
冒险者天堂
信息中心
cnread[\.。.·﹒]net
ezla[\.。.·﹒]com?[\.。.·﹒]tw
thebook[\.。.·﹒]yeah[\.。.·﹒]net
y(?:esho[\.。.·﹒]com/wenxue|uzispy[\.。.·﹒]yeah[\.。.·﹒]net)
www[\.。.·﹒](?:v-war|oldrain)[\.。.·﹒](?:net|com)
DATA
    return $line_end_patterns;
}
208
|
|
|
|
|
|
|
sub getpattern_remove_line_by_end_special_data {
    # Returns the "special" companion data for the remove-line-by-end rule:
    # a single line of four characters followed by a newline.  The same four
    # characters form the first character class listed in
    # getpattern_remove_line_by_end_data.
    my $special_chars = <<'DATA';
报网社讯
DATA
    return $special_chars;
}
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
1; |
215
|
|
|
|
|
|
|
__END__ |