File Coverage

blib/lib/Hailo/Storage.pm
Criterion Covered Total %
statement 10 10 100.0
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 14 100.0


line stmt bran cond sub pod time code
1             package Hailo::Storage;
2             BEGIN {
3 1     1   33950 $Hailo::Storage::AUTHORITY = 'cpan:AVAR';
4             }
5             {
6             $Hailo::Storage::VERSION = '0.72';
7             }
8              
9 1     1   34 use 5.010;
  1         3  
  1         41  
10 1     1   846 use Any::Moose;
  1         34277  
  1         7  
11 1     1   589 use Any::Moose 'X::StrictConstructor';
  1         2  
  1         5  
12             use DBI;
13             use Hailo::Storage::Schema;
14              
# Name of the DBD::* driver. No builder is defined in this class; the
# POD states that subclasses provide _build_dbd.
has dbd => (
    is            => 'ro',
    isa           => 'Str',
    lazy_build    => 1,
    documentation => "The DBD::* driver we're using",
);

# Attribute hash used as the final argument to DBI->connect(); the
# default value comes from _build_dbd_options.
has dbd_options => (
    is            => 'ro',
    isa           => 'HashRef',
    lazy_build    => 1,
    documentation => 'Options passed as the last argument to DBI->connect()',
);
28              
# Builder for the dbd_options attribute.
#
# Returns a new HashRef holding the default DBI connection attributes
# (currently only RaiseError => 1).
sub _build_dbd_options {
    my %defaults = (RaiseError => 1);

    return \%defaults;
}
35              
# The DBI database handle (a DBI::db object), created on first use by
# _build_dbh.
has dbh => (
    is            => 'ro',
    isa           => 'DBI::db',
    lazy_build    => 1,
    documentation => 'Our DBD object',
);
42              
# Builder for the dbh attribute: open the database connection.
#
# dbi_options is declared with auto_deref, so calling it in list context
# expands it into the argument list for DBI->connect(). The options are
# read exactly once.
#
# Returns whatever DBI->connect() returns.
sub _build_dbh {
    my ($self) = @_;

    return DBI->connect($self->dbi_options);
}
49              
# Complete argument list for DBI->connect(), stored as an ArrayRef.
# auto_deref makes the reader return the elements as a list when it is
# called in list context.
has dbi_options => (
    is            => 'ro',
    isa           => 'ArrayRef',
    auto_deref    => 1,
    lazy_build    => 1,
    documentation => 'Options passed to DBI->connect()',
);
57              
# Builder for the dbi_options attribute.
#
# Returns an ArrayRef of four elements: a DSN built from the driver name
# and the brain (an undefined brain becomes the empty string), two empty
# strings, and the dbd_options hash.
sub _build_dbi_options {
    my ($self) = @_;

    my $driver   = $self->dbd;
    my $attrs    = $self->dbd_options;
    my $database = $self->brain;
    $database = '' unless defined $database;

    return [ "dbi:${driver}:dbname=${database}", '', '', $attrs ];
}
73              
# Flag set to 1 at the end of _engage; the public entry points check it
# so that the setup work only runs once.
has _engaged => (
    is            => 'rw',
    isa           => 'Bool',
    default       => 0,
    documentation => 'Have we done setup work to get this database going?',
);

# Prepared statement handles keyed by name, obtained on first use from
# Hailo::Storage::Schema->sth via _build_sth.
has sth => (
    is            => 'ro',
    isa           => 'HashRef',
    lazy_build    => 1,
    documentation => 'A HashRef of prepared DBI statement handles',
);
87              
# Builder for the sth attribute: hands our driver name, database handle
# and order to Hailo::Storage::Schema->sth and returns its result.
sub _build_sth {
    my ($self) = @_;

    my @schema_args = ($self->dbd, $self->dbh, $self->order);

    return Hailo::Storage::Schema->sth(@schema_args);
}
92              
# Id of the token that _engage looks up (token_id) or inserts
# (add_token) with the arguments (0, '').
has _boundary_token_id => (
    is  => 'rw',
    isa => 'Int',
);
97              
# Bootstrap the database, then mark this object as engaged.
#
# A fresh database gets the schema deployed, our order and tokenizer
# class recorded through the set_info statement, and the (0, '') token
# inserted. An already initialized database has its stored order and
# tokenizer class reconciled with ours and the (0, '') token looked up.
# Either way the token's id ends up in _boundary_token_id.
#
# Note that $self->sth is only touched after the deploy or the
# reconciliation: it is built lazily and its builder reads the order.
sub _engage {
    my ($self) = @_;

    my $boundary_id;

    if (not $self->initialized) {
        Hailo::Storage::Schema->deploy($self->dbd, $self->dbh, $self->order);

        # Store metadata for subsequent invocations: the order must not
        # change again, and a changed tokenizer should be noticed.
        my $markov_order = $self->order;
        $self->sth->{set_info}->execute('markov_order', $markov_order);
        $self->sth->{set_info}->execute('tokenizer_class', $self->tokenizer_class);

        $self->sth->{add_token}->execute(0, '');

        my $rowid_sth = $self->sth->{last_token_rowid};
        $rowid_sth->execute();
        $boundary_id = $rowid_sth->fetchrow_array();
    }
    else {
        # Validate the order we were given, or adopt the stored one.
        $self->_engage_initialized_check_and_set_order;

        # The same for the tokenizer class.
        $self->_engage_initialized_check_and_set_tokenizer;

        my $token_sth = $self->sth->{token_id};
        $token_sth->execute(0, '');
        $boundary_id = $token_sth->fetchrow_array;
    }

    $self->_boundary_token_id($boundary_id);
    $self->_engaged(1);

    return;
}
139              
# Reconcile our order with the 'markov_order' value stored in the info
# table of an existing brain.
#
# Nothing happens when the two agree. When they differ and the
# has_custom_order callback in $self->hailo reports true, we die with an
# explanatory message; otherwise the stored order is adopted on this
# object and passed to the set_order callback.
sub _engage_initialized_check_and_set_order {
    my ($self) = @_;

    my $query = $self->dbh->prepare(qq[SELECT text FROM info WHERE attribute = ?;]);
    $query->execute('markov_order');
    my $db_order = $query->fetchrow_array();

    my $my_order = $self->order;
    return if $my_order == $db_order;

    my $order_was_supplied = $self->hailo->{has_custom_order}->();
    if ($order_was_supplied) {
        die <<"DIE";
You've manually supplied an order of `$my_order' to Hailo but you're
loading a brain that has the order `$db_order'.

Hailo will automatically load the order from existing brains, however
you've constructed Hailo and manually specified an order not
equivalent to the existing order of the database.

Either supply the correct order or omit the order attribute
altogether. We could continue but I'd rather die since you're probably
expecting something I can't deliver.
DIE
    }

    $self->order($db_order);
    $self->hailo->{set_order}->($db_order);

    return;
}
170              
# Reconcile our tokenizer class with the 'tokenizer_class' value stored
# in the info table of an existing brain.
#
# Nothing happens when the brain has no stored tokenizer class or when
# it matches ours. When they differ and the has_custom_tokenizer_class
# callback in $self->hailo reports true, we die with an explanatory
# message; otherwise the stored class is adopted on this object and
# passed to the set_tokenizer_class callback.
sub _engage_initialized_check_and_set_tokenizer {
    my ($self) = @_;

    my $sth = $self->dbh->prepare(qq[SELECT text FROM info WHERE attribute = ?;]);
    $sth->execute('tokenizer_class');
    my $db_tokenizer_class = $sth->fetchrow_array;
    my $my_tokenizer_class = $self->tokenizer_class;

    # defined() because we can't count on old brains having this
    if (defined $db_tokenizer_class
        and $my_tokenizer_class ne $db_tokenizer_class) {
        if ($self->hailo->{has_custom_tokenizer_class}->()) {
            # The advice names the tokenizer_class attribute; it used to
            # point at the order attribute, copied from the order check.
            die <<"DIE";
You've manually supplied a tokenizer class `$my_tokenizer_class' to
Hailo, but you're loading a brain that has the tokenizer class
`$db_tokenizer_class'.

Hailo will automatically load the tokenizer class from existing
brains, however you've constructed Hailo and manually specified a
tokenizer class not equivalent to the existing tokenizer class of the
database.

Either supply the correct tokenizer class or omit the tokenizer_class
attribute altogether. We could continue but I'd rather die since you're
probably expecting something I can't deliver.
DIE
        }

        $self->tokenizer_class($db_tokenizer_class);
        $self->hailo->{set_tokenizer_class}->($db_tokenizer_class);
    }

    return;
}
205              
# Begin a training session: make sure the database is engaged, then
# delegate to start_learning. Returns nothing.
sub start_training {
    my ($self) = @_;

    if (not $self->_engaged) {
        $self->_engage();
    }
    $self->start_learning();

    return;
}
212              
# End a training session by delegating to stop_learning. Returns nothing.
sub stop_training {
    my $self = shift;

    $self->stop_learning();

    return;
}
218              
# Make sure the database is engaged, then open a transaction on the
# database handle. Returns nothing.
sub start_learning {
    my ($self) = @_;

    if (not $self->_engaged) {
        $self->_engage();
    }

    # Everything learned from here on belongs to one transaction.
    my $dbh = $self->dbh;
    $dbh->begin_work;

    return;
}
227              
# Commit the transaction that is open on the database handle.
# Returns nothing.
sub stop_learning {
    my $self = shift;

    my $dbh = $self->dbh;
    $dbh->commit;

    return;
}
234              
# Report whether the database already has a deployed schema.
#
# Runs "SELECT count(*) FROM info;" and treats the database as up and
# running when the statement neither dies nor warns and yields a defined
# result. Returns a true value in that case and a false value otherwise.
#
# The probe is expected to fail on a fresh database, so $@ and $! are
# localized for the whole sub: the caller's error variables stay intact.
sub initialized {
    my ($self) = @_;
    my $dbh = $self->dbh;

    # Localized outside the eval; eval itself resets $@, so localizing
    # inside the block would not protect the caller's value.
    local ($@, $!);

    my ($err, $res);
    eval {
        # SQLite will warn 'no such table info'
        local $SIG{__WARN__} = sub { $err = $_[0] };

        # If it doesn't warn trust that it dies here
        $res = $dbh->do("SELECT count(*) FROM info;");
    };

    return (not $err and defined $res);
}
253              
# Return some statistics about the brain.
#
# Engages the database if necessary, then runs the token_total,
# expr_total, prev_total and next_total statements in that order.
# Returns the list (tokens, expressions, previous, next): one is
# subtracted from the token count, and the other three default to 0
# when the query yields no defined value.
sub totals {
    my ($self) = @_;
    $self->_engage() unless $self->_engaged;

    my %count_for;
    for my $name (qw(token_total expr_total prev_total next_total)) {
        my $handle = $self->sth->{$name};
        $handle->execute();

        # Scalar assignment keeps fetchrow_array in scalar context.
        $count_for{$name} = $handle->fetchrow_array;
    }

    my $token = $count_for{token_total} - 1;
    my $expr  = $count_for{expr_total} // 0;
    my $prev  = $count_for{prev_total} // 0;
    my $next  = $count_for{next_total} // 0;

    return $token, $expr, $prev, $next;
}
270              
271             __PACKAGE__->meta->make_immutable;
272              
273             =encoding utf8
274              
275             =head1 NAME
276              
277             Hailo::Storage - A base class for L<Hailo> L<storage|Hailo::Role::Storage> backends
278              
279             =head1 METHODS
280              
281             The following two methods must be implemented by subclasses:
282              
283             =head2 C<_build_dbd>
284              
285             Should return the name of the database driver (e.g. 'SQLite') which will be
286             passed to L<DBI>.
287              
288             =head2 C<_build_dbd_options>
289              
290             Subclasses can override this method to add options of their own. E.g:
291              
292             override _build_dbd_options => sub {
293             return {
294             %{ super() },
295             sqlite_unicode => 1,
296             };
297             };
298              
299             =head1 Comparison of backends
300              
301             This benchmark shows how the backends compare when training on the
302             small testsuite dataset as reported by the F<utils/hailo-benchmark>
303             utility (found in the distribution):
304              
305             Rate DBD::Pg DBD::mysql DBD::SQLite/file DBD::SQLite/memory
306             DBD::Pg 2.22/s -- -33% -49% -56%
307             DBD::mysql 3.33/s 50% -- -23% -33%
308             DBD::SQLite/file 4.35/s 96% 30% -- -13%
309             DBD::SQLite/memory 5.00/s 125% 50% 15% --
310              
311             Under real-world workloads SQLite is much faster than these results
312             indicate since the time it takes to train/reply is relative to the
313             existing database size. Here's how long it took to train on a 214,710
314             line IRC log on a Linode 1080 with Hailo 0.18:
315              
316             =over
317              
318             =item * SQLite
319              
320             real 8m38.285s
321             user 8m30.831s
322             sys 0m1.175s
323              
324             =item * MySQL
325              
326             real 48m30.334s
327             user 8m25.414s
328             sys 4m38.175s
329              
330             =item * PostgreSQL
331              
332             real 216m38.906s
333             user 11m13.474s
334             sys 4m35.509s
335              
336             =back
337              
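338             In the case of PostgreSQL it's actually much faster to first train
339             with SQLite, dump that database and then import it with L<pgloader|http://pgfoundry.org/projects/pgloader/>,
340             see L<hailo-sqlite-to-postgres|http://github.com/hailo/hailo/blob/master/utils/hailo-sqlite-to-postgres> for how to do
341             that.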
342              
343             However, replying with an existing database (using
344             F<utils/hailo-benchmark-replies>) yields different results. SQLite can
345             reply really quickly without being warmed up (which is the typical
346             use case for chatbots) but once PostgreSQL and MySQL are warmed up they
347             start replying faster.
348              
349             Here's a comparison of doing 10 replies:
350              
351             Rate PostgreSQL MySQL SQLite-file SQLite-file-28MB SQLite-memory
352             PostgreSQL 71.4/s -- -14% -14% -29% -50%
353             MySQL 83.3/s 17% -- 0% -17% -42%
354             SQLite-file 83.3/s 17% 0% -- -17% -42%
355             SQLite-file-28MB 100.0/s 40% 20% 20% -- -30%
356             SQLite-memory 143/s 100% 71% 71% 43% --
357              
358             In this test MySQL uses around 28MB of memory (using Debian's
359             F<my-small.cnf>) and PostgreSQL around 34MB. Plain SQLite uses 2MB of
360             cache but it's also tested with 28MB of cache as well as with the
361             entire database in memory.
362              
363             But doing 10,000 replies is very different:
364              
365             Rate SQLite-file PostgreSQL SQLite-file-28MB MySQL SQLite-memory
366             SQLite-file 85.1/s -- -7% -18% -27% -38%
367             PostgreSQL 91.4/s 7% -- -12% -21% -33%
368             SQLite-file-28MB 103/s 21% 13% -- -11% -25%
369             MySQL 116/s 37% 27% 13% -- -15%
370             SQLite-memory 137/s 61% 50% 33% 18% --
371              
372             Once MySQL gets more memory (using Debian's F<my-large.cnf>) and a
373             chance to warm up, it starts yielding better results (I couldn't find out
374             how to make PostgreSQL take as much memory as it wanted):
375              
376             Rate MySQL SQLite-memory
377             MySQL 121/s -- -12%
378             SQLite-memory 138/s 14% --
379              
380             =head1 AUTHOR
381              
382             Ævar Arnfjörð Bjarmason <avar@cpan.org>
383              
384             Hinrik Örn Sigurðsson, hinrik.sig@gmail.com
385              
386             =head1 LICENSE AND COPYRIGHT
387              
388             Copyright 2010 Ævar Arnfjörð Bjarmason and
389             Hinrik Örn Sigurðsson
390              
391             This program is free software, you can redistribute it and/or modify
392             it under the same terms as Perl itself.
393              
394             =cut