File Coverage

udpipe/udpipe.cpp

Criterion	Covered	Total	%
statement	1870	9263	20.1
branch	1707	16906	10.1
condition			n/a
subroutine			n/a
pod			n/a
total	3577	26169	13.6

line	stmt	bran	code
1			// This file is part of UDPipe .
2			//
3			// This file is a bundle of all sources and headers of UDPipe library.
4			// Comments and copyrights of all individual files are kept.
5
6			#include
7			#include
8			#include
9			#include
10			#include
11			#include
12			#include
13			#include
14			#include
15			#include
16			#include
17			#include
18			#include
19			#include
20			#include
21			#include
22			#include
23			#include
24			#include
25			#include
26			#include
27			#include
28			#include
29			#include
30			#include
31			#include
32
33			namespace ufal {
34			namespace udpipe {
35
36			/////////
37			// File: utils/common.h
38			/////////
39
40			// This file is part of UFAL C++ Utils .
41			//
42			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
43			// Mathematics and Physics, Charles University in Prague, Czech Republic.
44			//
45			// This Source Code Form is subject to the terms of the Mozilla Public
46			// License, v. 2.0. If a copy of the MPL was not distributed with this
47			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
48
49			// Headers available in all sources
50
51			namespace utils {
52
53			using namespace std;
54
55			// Assert that int is at least 4B
56			static_assert(sizeof(int) >= sizeof(int32_t), "Int must be at least 4B wide!");
57
58			// Assert that we are on a little endian system
59			#ifdef __BYTE_ORDER__
60			static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Only little endian systems are supported!");
61			#endif
62
63			#define runtime_failure(message) exit((cerr << message << endl, 1))
64
65			} // namespace utils
66
67			/////////
68			// File: utils/string_piece.h
69			/////////
70
71			// This file is part of UFAL C++ Utils .
72			//
73			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
74			// Mathematics and Physics, Charles University in Prague, Czech Republic.
75			//
76			// This Source Code Form is subject to the terms of the Mozilla Public
77			// License, v. 2.0. If a copy of the MPL was not distributed with this
78			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
79
80			namespace utils {
81
82			struct string_piece {
83			const char* str;
84			size_t len;
85
86	16		string_piece() : str(nullptr), len(0) {}
87	42		string_piece(const char* str) : str(str), len(strlen(str)) {}
88	128		string_piece(const char* str, size_t len) : str(str), len(len) {}
89	43		string_piece(const string& str) : str(str.c_str()), len(str.size()) {}
90			};
91
92			inline ostream& operator<<(ostream& os, const string_piece& str) {
93	0	0	return os.write(str.str, str.len);
		0
		0
		0
		0
		0
		0
		0
		0
		0
94			}
95
96			inline bool operator==(const string_piece& a, const string_piece& b) {
97	73	100	return a.len == b.len && memcmp(a.str, b.str, a.len) == 0;
		50
		100
		50
		50
		0
		50
		0
		100
		50
		100
		50
		50
		0
		50
		50
		100
		50
		50
		50
		50
		0
		50
		50
		0
		0
		0
		0
		0
		0
		0
		0
98			}
99
100			inline bool operator!=(const string_piece& a, const string_piece& b) {
101			return a.len != b.len || memcmp(a.str, b.str, a.len) != 0;
102			}
103
104			} // namespace utils
105
106			/////////
107			// File: common.h
108			/////////
109
110			// This file is part of UDPipe .
111			//
112			// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
113			// Mathematics and Physics, Charles University in Prague, Czech Republic.
114			//
115			// This Source Code Form is subject to the terms of the Mozilla Public
116			// License, v. 2.0. If a copy of the MPL was not distributed with this
117			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
118
119			using namespace utils;
120
121			/////////
122			// File: sentence/empty_node.h
123			/////////
124
125			// This file is part of UDPipe .
126			//
127			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
128			// Mathematics and Physics, Charles University in Prague, Czech Republic.
129			//
130			// This Source Code Form is subject to the terms of the Mozilla Public
131			// License, v. 2.0. If a copy of the MPL was not distributed with this
132			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
133
134	0		class empty_node {
135			public:
136			int id; // 0 is root, >0 is sentence word, <0 is undefined
137			int index; // index for the current id, should be numbered from 1, 0=undefined
138			string form; // form
139			string lemma; // lemma
140			string upostag; // universal part-of-speech tag
141			string xpostag; // language-specific part-of-speech tag
142			string feats; // list of morphological features
143			string deps; // secondary dependencies
144			string misc; // miscellaneous information
145
146	0		empty_node(int id = -1, int index = 0) : id(id), index(index) {}
147			};
148
149			/////////
150			// File: sentence/token.h
151			/////////
152
153			// This file is part of UDPipe .
154			//
155			// Copyright 2017 Institute of Formal and Applied Linguistics, Faculty of
156			// Mathematics and Physics, Charles University in Prague, Czech Republic.
157			//
158			// This Source Code Form is subject to the terms of the Mozilla Public
159			// License, v. 2.0. If a copy of the MPL was not distributed with this
160			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
161
162	24		class token {
163			public:
164			string form;
165			string misc;
166
167			token(string_piece form = string_piece(), string_piece misc = string_piece());
168
169			// CoNLL-U defined SpaceAfter=No feature
170			bool get_space_after() const;
171			void set_space_after(bool space_after);
172
173			// UDPipe-specific all-spaces-preserving SpacesBefore and SpacesAfter features
174			void get_spaces_before(string& spaces_before) const;
175			void set_spaces_before(string_piece spaces_before);
176			void get_spaces_after(string& spaces_after) const;
177			void set_spaces_after(string_piece spaces_after);
178			void get_spaces_in_token(string& spaces_in_token) const;
179			void set_spaces_in_token(string_piece spaces_in_token);
180
181			// UDPipe-specific TokenRange feature
182			bool get_token_range(size_t& start, size_t& end) const;
183			void set_token_range(size_t start, size_t end);
184
185			private:
186			bool get_misc_field(string_piece name, string_piece& value) const;
187			void remove_misc_field(string_piece name);
188			string& start_misc_field(string_piece name);
189
190			void append_escaped_spaces(string_piece spaces, string& escaped_spaces) const;
191			void unescape_spaces(string_piece escaped_spaces, string& spaces) const;
192			};
193
194			/////////
195			// File: sentence/multiword_token.h
196			/////////
197
198			// This file is part of UDPipe .
199			//
200			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
201			// Mathematics and Physics, Charles University in Prague, Czech Republic.
202			//
203			// This Source Code Form is subject to the terms of the Mozilla Public
204			// License, v. 2.0. If a copy of the MPL was not distributed with this
205			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
206
207	0	0	class multiword_token : public token {
		0
		0
208			public:
209			// form and misc are inherited from token
210			int id_first, id_last;
211
212			multiword_token(int id_first = -1, int id_last = -1, string_piece form = string_piece(), string_piece misc = string_piece())
213	0		: token(form, misc), id_first(id_first), id_last(id_last) {}
214			};
215
216			/////////
217			// File: sentence/word.h
218			/////////
219
220			// This file is part of UDPipe .
221			//
222			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
223			// Mathematics and Physics, Charles University in Prague, Czech Republic.
224			//
225			// This Source Code Form is subject to the terms of the Mozilla Public
226			// License, v. 2.0. If a copy of the MPL was not distributed with this
227			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
228
229	46	0	class word : public token {
230			public:
231			// form and misc are inherited from token
232			int id; // 0 is root, >0 is sentence word, <0 is undefined
233			string lemma; // lemma
234			string upostag; // universal part-of-speech tag
235			string xpostag; // language-specific part-of-speech tag
236			string feats; // list of morphological features
237			int head; // head, 0 is root, <0 is undefined
238			string deprel; // dependency relation to the head
239			string deps; // secondary dependencies
240
241			vector<int> children;
242
243	20		word(int id = -1, string_piece form = string_piece()) : token(form), id(id), head(-1) {}
244			};
245
246			/////////
247			// File: sentence/sentence.h
248			/////////
249
250			// This file is part of UDPipe .
251			//
252			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
253			// Mathematics and Physics, Charles University in Prague, Czech Republic.
254			//
255			// This Source Code Form is subject to the terms of the Mozilla Public
256			// License, v. 2.0. If a copy of the MPL was not distributed with this
257			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
258
259	0	0	class sentence {
		0
		0
260			public:
261			sentence();
262
263			vector<word> words;
264			vector<multiword_token> multiword_tokens;
265			vector<empty_node> empty_nodes;
266			vector<string> comments;
267			static const string root_form;
268
269			// Basic sentence modifications
270			bool empty();
271			void clear();
272			word& add_word(string_piece form = string_piece());
273			void set_head(int id, int head, const string& deprel);
274			void unlink_all_words();
275
276			// CoNLL-U defined comments
277			bool get_new_doc(string* id = nullptr) const;
278			void set_new_doc(bool new_doc, string_piece id = string_piece());
279			bool get_new_par(string* id = nullptr) const;
280			void set_new_par(bool new_par, string_piece id = string_piece());
281			bool get_sent_id(string& id) const;
282			void set_sent_id(string_piece id);
283			bool get_text(string& text) const;
284			void set_text(string_piece text);
285
286			private:
287			bool get_comment(string_piece name, string* value) const;
288			void remove_comment(string_piece name);
289			void set_comment(string_piece name, string_piece value = string_piece());
290			};
291
292			/////////
293			// File: sentence/input_format.h
294			/////////
295
296			// This file is part of UDPipe .
297			//
298			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
299			// Mathematics and Physics, Charles University in Prague, Czech Republic.
300			//
301			// This Source Code Form is subject to the terms of the Mozilla Public
302			// License, v. 2.0. If a copy of the MPL was not distributed with this
303			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
304
305	1		class input_format {
306			public:
307	1		virtual ~input_format() {}
308
309			virtual bool read_block(istream& is, string& block) const = 0;
310			virtual void reset_document(string_piece id = string_piece()) = 0;
311			virtual void set_text(string_piece text, bool make_copy = false) = 0;
312			virtual bool next_sentence(sentence& s, string& error) = 0;
313
314			// Static factory methods
315			static input_format* new_input_format(const string& name);
316			static input_format* new_conllu_input_format(const string& options = string());
317			static input_format* new_generic_tokenizer_input_format(const string& options = string());
318			static input_format* new_horizontal_input_format(const string& options = string());
319			static input_format* new_vertical_input_format(const string& options = string());
320
321			static input_format* new_presegmented_tokenizer(input_format* tokenizer);
322
323			static const string CONLLU_V1;
324			static const string CONLLU_V2;
325			static const string GENERIC_TOKENIZER_NORMALIZED_SPACES;
326			static const string GENERIC_TOKENIZER_PRESEGMENTED;
327			static const string GENERIC_TOKENIZER_RANGES;
328			};
329
330			/////////
331			// File: model/model.h
332			/////////
333
334			// This file is part of UDPipe .
335			//
336			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
337			// Mathematics and Physics, Charles University in Prague, Czech Republic.
338			//
339			// This Source Code Form is subject to the terms of the Mozilla Public
340			// License, v. 2.0. If a copy of the MPL was not distributed with this
341			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
342
343	1		class model {
344			public:
345	1		virtual ~model() {}
346
347			static model* load(const char* fname);
348			static model* load(istream& is);
349
350			virtual input_format* new_tokenizer(const string& options) const = 0;
351			virtual bool tag(sentence& s, const string& options, string& error) const = 0;
352			virtual bool parse(sentence& s, const string& options, string& error) const = 0;
353
354			static const string DEFAULT;
355			static const string TOKENIZER_NORMALIZED_SPACES;
356			static const string TOKENIZER_PRESEGMENTED;
357			static const string TOKENIZER_RANGES;
358			};
359
360			/////////
361			// File: model/evaluator.h
362			/////////
363
364			// This file is part of UDPipe .
365			//
366			// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
367			// Mathematics and Physics, Charles University in Prague, Czech Republic.
368			//
369			// This Source Code Form is subject to the terms of the Mozilla Public
370			// License, v. 2.0. If a copy of the MPL was not distributed with this
371			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
372
373			class evaluator {
374			public:
375			evaluator(const model* m, const string& tokenizer, const string& tagger, const string& parser);
376
377			void set_model(const model* m);
378			void set_tokenizer(const string& tokenizer);
379			void set_tagger(const string& tagger);
380			void set_parser(const string& parser);
381
382			bool evaluate(istream& is, ostream& os, string& error) const;
383
384			static const string DEFAULT;
385			static const string NONE;
386
387			private:
388			const model* m;
389			string tokenizer, tagger, parser;
390
391			struct f1_info { size_t total_system, total_gold; double precision, recall, f1; };
392			template
393			static f1_info evaluate_f1(const vector>& system, const vector>& gold);
394
395	0		class evaluation_data {
396			public:
397	0		struct word_data {
398			size_t start, end;
399			bool is_multiword;
400			word w;
401
402			word_data(size_t start, size_t end, int id, bool is_multiword, const word& w);
403			};
404
405			void add_sentence(const sentence& s);
406
407			u32string chars;
408			vector> sentences, tokens;
409			vector> multiwords;
410			vector<word_data> words;
411			};
412
413	0		class word_alignment {
414			public:
415	0		struct pair_system_gold {
416			word system; const word& gold;
417	0	0	pair_system_gold(const word& system, const word& gold) : system(system), gold(gold) {}
418			};
419			vector<pair_system_gold> matched;
420			size_t total_system, total_gold;
421
422			template <class Equals>
423			f1_info evaluate_f1(Equals equals);
424
425			static bool perfect_alignment(const evaluation_data& system, const evaluation_data& gold, word_alignment& alignment);
426			static void best_alignment(const evaluation_data& system, const evaluation_data& gold, word_alignment& alignment);
427			};
428			};
429
430			/////////
431			// File: unilib/unicode.h
432			/////////
433
434			// This file is part of UniLib .
435			//
436			// Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of
437			// Mathematics and Physics, Charles University in Prague, Czech Republic.
438			//
439			// This Source Code Form is subject to the terms of the Mozilla Public
440			// License, v. 2.0. If a copy of the MPL was not distributed with this
441			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
442			//
443			// UniLib version: 3.3.0
444			// Unicode version: 15.0.0
445
446			namespace unilib {
447
448			class unicode {
449			enum : uint8_t {
450			_Lu = 1, _Ll = 2, _Lt = 3, _Lm = 4, _Lo = 5,
451			_Mn = 6, _Mc = 7, _Me = 8,
452			_Nd = 9, _Nl = 10, _No = 11,
453			_Pc = 12, _Pd = 13, _Ps = 14, _Pe = 15, _Pi = 16, _Pf = 17, _Po = 18,
454			_Sm = 19, _Sc = 20, _Sk = 21, _So = 22,
455			_Zs = 23, _Zl = 24, _Zp = 25,
456			_Cc = 26, _Cf = 27, _Cs = 28, _Co = 29, _Cn = 30
457			};
458
459			public:
460			typedef uint32_t category_t;
461			enum : category_t {
462			Lu = 1 << _Lu, Ll = 1 << _Ll, Lt = 1 << _Lt, Lut = Lu | Lt, LC = Lu | Ll | Lt,
463			Lm = 1 << _Lm, Lo = 1 << _Lo, L = Lu | Ll | Lt | Lm | Lo,
464			Mn = 1 << _Mn, Mc = 1 << _Mc, Me = 1 << _Me, M = Mn | Mc | Me,
465			Nd = 1 << _Nd, Nl = 1 << _Nl, No = 1 << _No, N = Nd | Nl | No,
466			Pc = 1 << _Pc, Pd = 1 << _Pd, Ps = 1 << _Ps, Pe = 1 << _Pe, Pi = 1 << _Pi,
467			Pf = 1 << _Pf, Po = 1 << _Po, P = Pc | Pd | Ps | Pe | Pi | Pf | Po,
468			Sm = 1 << _Sm, Sc = 1 << _Sc, Sk = 1 << _Sk, So = 1 << _So, S = Sm | Sc | Sk | So,
469			Zs = 1 << _Zs, Zl = 1 << _Zl, Zp = 1 << _Zp, Z = Zs | Zl | Zp,
470			Cc = 1 << _Cc, Cf = 1 << _Cf, Cs = 1 << _Cs, Co = 1 << _Co, Cn = 1 << _Cn, C = Cc | Cf | Cs | Co | Cn
471			};
472
473			static inline category_t category(char32_t chr);
474
475			static inline char32_t lowercase(char32_t chr);
476			static inline char32_t uppercase(char32_t chr);
477			static inline char32_t titlecase(char32_t chr);
478
479			private:
480			static const char32_t CHARS = 0x110000;
481			static const int32_t DEFAULT_CAT = Cn;
482
483			static const uint8_t category_index[CHARS >> 8];
484			static const uint8_t category_block[][256];
485			static const uint8_t othercase_index[CHARS >> 8];
486			static const char32_t othercase_block[][256];
487
488			enum othercase_type { LOWER_ONLY = 1, UPPERTITLE_ONLY = 2, UPPER_ONLY = 3, LOWER_THEN_UPPER = 4, UPPER_THEN_TITLE = 5, TITLE_THEN_LOWER = 6 };
489			};
490
491			unicode::category_t unicode::category(char32_t chr) {
492	101	0	return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT;
		0
		50
		0
		50
		50
		50
		50
		0
		0
		0
		50
		50
		0
		0
		0
		50
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		50
		50
493			}
494
495	30		char32_t unicode::lowercase(char32_t chr) {
496	30	50	if (chr < CHARS) {
497	30		char32_t othercase = othercase_block[othercase_index[chr >> 8]][chr & 0xFF];
498	30	100	if ((othercase & 0xFF) == othercase_type::LOWER_ONLY) return othercase >> 8;
499	28	50	if ((othercase & 0xFF) == othercase_type::LOWER_THEN_UPPER) return othercase >> 8;
500	28	50	if ((othercase & 0xFF) == othercase_type::TITLE_THEN_LOWER) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8;
501			}
502			return chr;
503			}
504
505	0		char32_t unicode::uppercase(char32_t chr) {
506	0	0	if (chr < CHARS) {
507	0		char32_t othercase = othercase_block[othercase_index[chr >> 8]][chr & 0xFF];
508	0	0	if ((othercase & 0xFF) == othercase_type::UPPERTITLE_ONLY) return othercase >> 8;
509	0	0	if ((othercase & 0xFF) == othercase_type::UPPER_ONLY) return othercase >> 8;
510	0	0	if ((othercase & 0xFF) == othercase_type::UPPER_THEN_TITLE) return othercase >> 8;
511	0	0	if ((othercase & 0xFF) == othercase_type::LOWER_THEN_UPPER) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8;
512			}
513			return chr;
514			}
515
516			char32_t unicode::titlecase(char32_t chr) {
517			if (chr < CHARS) {
518			char32_t othercase = othercase_block[othercase_index[chr >> 8]][chr & 0xFF];
519			if ((othercase & 0xFF) == othercase_type::UPPERTITLE_ONLY) return othercase >> 8;
520			if ((othercase & 0xFF) == othercase_type::TITLE_THEN_LOWER) return othercase >> 8;
521			if ((othercase & 0xFF) == othercase_type::UPPER_THEN_TITLE) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8;
522			}
523			return chr;
524			}
525
526			} // namespace unilib
527
528			/////////
529			// File: unilib/utf8.h
530			/////////
531
532			// This file is part of UniLib .
533			//
534			// Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of
535			// Mathematics and Physics, Charles University in Prague, Czech Republic.
536			//
537			// This Source Code Form is subject to the terms of the Mozilla Public
538			// License, v. 2.0. If a copy of the MPL was not distributed with this
539			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
540			//
541			// UniLib version: 3.3.0
542			// Unicode version: 15.0.0
543
544			namespace unilib {
545
546			class utf8 {
547			public:
548			static bool valid(const char* str);
549			static bool valid(const char* str, size_t len);
550			static inline bool valid(const std::string& str);
551
552			static inline char32_t decode(const char*& str);
553			static inline char32_t decode(const char*& str, size_t& len);
554			static inline char32_t first(const char* str);
555			static inline char32_t first(const char* str, size_t len);
556			static inline char32_t first(const std::string& str);
557
558			static void decode(const char* str, std::u32string& decoded);
559			static void decode(const char* str, size_t len, std::u32string& decoded);
560			static inline void decode(const std::string& str, std::u32string& decoded);
561
562			class string_decoder {
563			public:
564			class iterator;
565			inline iterator begin();
566			inline iterator end();
567			private:
568			inline string_decoder(const char* str);
569			const char* str;
570			friend class utf8;
571			};
572			static inline string_decoder decoder(const char* str);
573			static inline string_decoder decoder(const std::string& str);
574
575			class buffer_decoder {
576			public:
577			class iterator;
578			inline iterator begin();
579			inline iterator end();
580			private:
581			inline buffer_decoder(const char* str, size_t len);
582			const char* str;
583			size_t len;
584			friend class utf8;
585			};
586			static inline buffer_decoder decoder(const char* str, size_t len);
587
588			static inline void append(char*& str, char32_t chr);
589			static inline void append(std::string& str, char32_t chr);
590			static void encode(const std::u32string& str, std::string& encoded);
591
592			template <class F> static void map(F f, const char* str, std::string& result);
593			template <class F> static void map(F f, const char* str, size_t len, std::string& result);
594			template <class F> static void map(F f, const std::string& str, std::string& result);
595
596			private:
597			static const char REPLACEMENT_CHAR = '?';
598			};
599
600			bool utf8::valid(const std::string& str) {
601			return valid(str.c_str());
602			}
603
604	54		char32_t utf8::decode(const char*& str) {
605	54	50	if (((unsigned char)*str) < 0x80) return (unsigned char)*str++;
606	0	0	else if (((unsigned char)*str) < 0xC0) return ++str, REPLACEMENT_CHAR;
607	0	0	else if (((unsigned char)*str) < 0xE0) {
608	0		char32_t res = (((unsigned char)*str++) & 0x1F) << 6;
609	0	0	if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
		0
610	0		return res + (((unsigned char)*str++) & 0x3F);
611	0	0	} else if (((unsigned char)*str) < 0xF0) {
612	0		char32_t res = (((unsigned char)*str++) & 0x0F) << 12;
613	0	0	if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
		0
614	0		res += (((unsigned char)*str++) & 0x3F) << 6;
615	0	0	if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
		0
616	0		return res + (((unsigned char)*str++) & 0x3F);
617	0	0	} else if (((unsigned char)*str) < 0xF8) {
618	0		char32_t res = (((unsigned char)*str++) & 0x07) << 18;
619	0	0	if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
		0
620	0		res += (((unsigned char)*str++) & 0x3F) << 12;
621	0	0	if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
		0
622	0		res += (((unsigned char)*str++) & 0x3F) << 6;
623	0	0	if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
		0
624	0		return res + (((unsigned char)*str++) & 0x3F);
625	0		} else return ++str, REPLACEMENT_CHAR;
626			}
627
628	145		char32_t utf8::decode(const char*& str, size_t& len) {
629	145	50	if (!len) return 0;
630	145		--len;
631	145	100	if (((unsigned char)*str) < 0x80) return (unsigned char)*str++;
632	23	50	else if (((unsigned char)*str) < 0xC0) return ++str, REPLACEMENT_CHAR;
633	23	50	else if (((unsigned char)*str) < 0xE0) {
634	23		char32_t res = (((unsigned char)*str++) & 0x1F) << 6;
635	23	50	if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
		50
		50
636	23		return res + ((--len, ((unsigned char)*str++)) & 0x3F);
637	0	0	} else if (((unsigned char)*str) < 0xF0) {
638	0		char32_t res = (((unsigned char)*str++) & 0x0F) << 12;
639	0	0	if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
		0
		0
640	0		res += ((--len, ((unsigned char)*str++)) & 0x3F) << 6;
641	0	0	if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
		0
		0
642	0		return res + ((--len, ((unsigned char)*str++)) & 0x3F);
643	0	0	} else if (((unsigned char)*str) < 0xF8) {
644	0		char32_t res = (((unsigned char)*str++) & 0x07) << 18;
645	0	0	if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
		0
		0
646	0		res += ((--len, ((unsigned char)*str++)) & 0x3F) << 12;
647	0	0	if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
		0
		0
648	0		res += ((--len, ((unsigned char)*str++)) & 0x3F) << 6;
649	0	0	if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR;
		0
		0
650	0		return res + ((--len, ((unsigned char)*str++)) & 0x3F);
651	0		} else return ++str, REPLACEMENT_CHAR;
652			}
653
654			char32_t utf8::first(const char* str) {
655	0		return decode(str);
656			}
657
658			char32_t utf8::first(const char* str, size_t len) {
659	0		return decode(str, len);
660			}
661
662			char32_t utf8::first(const std::string& str) {
663			return first(str.c_str());
664			}
665
666			void utf8::decode(const std::string& str, std::u32string& decoded) {
667			decode(str.c_str(), decoded);
668			}
669
670			class utf8::string_decoder::iterator : public std::iterator {
671			public:
672	36		iterator(const char* str) : codepoint(0), next(str) { operator++(); }
673			iterator(const iterator& it) : codepoint(it.codepoint), next(it.next) {}
674	54	0	iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; }
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		50
		100
		50
		100
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
675			iterator operator++(int) { iterator tmp(*this); operator++(); return tmp; }
676			bool operator==(const iterator& other) const { return next == other.next; }
677			bool operator!=(const iterator& other) const { return next != other.next; }
678			const char32_t& operator*() { return codepoint; }
679			private:
680			char32_t codepoint;
681			const char* next;
682			};
683
684			utf8::string_decoder::string_decoder(const char* str) : str(str) {}
685
686			utf8::string_decoder::iterator utf8::string_decoder::begin() {
687			return iterator(str);
688			}
689
690			utf8::string_decoder::iterator utf8::string_decoder::end() {
691			return iterator(nullptr);
692			}
693
694			utf8::string_decoder utf8::decoder(const char* str) {
695			return string_decoder(str);
696			}
697
698			utf8::string_decoder utf8::decoder(const std::string& str) {
699			return string_decoder(str.c_str());
700			}
701
702			class utf8::buffer_decoder::iterator : public std::iterator {
703			public:
704	0		iterator(const char* str, size_t len) : codepoint(0), next(str), len(len) { operator++(); }
705			iterator(const iterator& it) : codepoint(it.codepoint), next(it.next), len(it.len) {}
706	0	0	iterator& operator++() { if (!len) next = nullptr; if (next) codepoint = decode(next, len); return *this; }
		0
707			iterator operator++(int) { iterator tmp(*this); operator++(); return tmp; }
708			bool operator==(const iterator& other) const { return next == other.next; }
709			bool operator!=(const iterator& other) const { return next != other.next; }
710			const char32_t& operator*() { return codepoint; }
711			private:
712			char32_t codepoint;
713			const char* next;
714			size_t len;
715			};
716
717			utf8::buffer_decoder::buffer_decoder(const char* str, size_t len) : str(str), len(len) {}
718
719			utf8::buffer_decoder::iterator utf8::buffer_decoder::begin() {
720			return iterator(str, len);
721			}
722
723			utf8::buffer_decoder::iterator utf8::buffer_decoder::end() {
724			return iterator(nullptr, 0);
725			}
726
727			utf8::buffer_decoder utf8::decoder(const char* str, size_t len) {
728			return buffer_decoder(str, len);
729			}
730
731			void utf8::append(char*& str, char32_t chr) {
732			if (chr < 0x80) *str++ = chr;
733			else if (chr < 0x800) { *str++ = 0xC0 + (chr >> 6); *str++ = 0x80 + (chr & 0x3F); }
734			else if (chr < 0x10000) { *str++ = 0xE0 + (chr >> 12); *str++ = 0x80 + ((chr >> 6) & 0x3F); *str++ = 0x80 + (chr & 0x3F); }
735			else if (chr < 0x200000) { *str++ = 0xF0 + (chr >> 18); *str++ = 0x80 + ((chr >> 12) & 0x3F); *str++ = 0x80 + ((chr >> 6) & 0x3F); *str++ = 0x80 + (chr & 0x3F); }
736			else *str++ = REPLACEMENT_CHAR;
737			}
738
739	30		void utf8::append(std::string& str, char32_t chr) {
740	30	100	if (chr < 0x80) str += chr;
741	5	50	else if (chr < 0x800) { str += 0xC0 + (chr >> 6); str += 0x80 + (chr & 0x3F); }
742	0	0	else if (chr < 0x10000) { str += 0xE0 + (chr >> 12); str += 0x80 + ((chr >> 6) & 0x3F); str += 0x80 + (chr & 0x3F); }
743	0	0	else if (chr < 0x200000) { str += 0xF0 + (chr >> 18); str += 0x80 + ((chr >> 12) & 0x3F); str += 0x80 + ((chr >> 6) & 0x3F); str += 0x80 + (chr & 0x3F); }
744			else str += REPLACEMENT_CHAR;
745	30		}
746
747	0		template <class F> void utf8::map(F f, const char* str, std::string& result) {
748			result.clear();
749
750	0	0	for (char32_t chr; (chr = decode(str)); )
751	0		append(result, f(chr));
752	0		}
753
754	7		template <class F> void utf8::map(F f, const char* str, size_t len, std::string& result) {
755			result.clear();
756
757	36	100	while (len)
758	29		append(result, f(decode(str, len)));
759	7		}
760
761			template <class F> void utf8::map(F f, const std::string& str, std::string& result) {
762	0	0	map(f, str.c_str(), result);
		0
		0
		0
		0
763			}
764
765			} // namespace unilib
766
767			/////////
768			// File: model/evaluator.cpp
769			/////////
770
771			// This file is part of UDPipe .
772			//
773			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
774			// Mathematics and Physics, Charles University in Prague, Czech Republic.
775			//
776			// This Source Code Form is subject to the terms of the Mozilla Public
777			// License, v. 2.0. If a copy of the MPL was not distributed with this
778			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
779
780	2		const string evaluator::DEFAULT;
781	2		const string evaluator::NONE = "none";
782
783	0		evaluator::evaluator(const model* m, const string& tokenizer, const string& tagger, const string& parser) {
784			set_model(m);
785			set_tokenizer(tokenizer);
786			set_tagger(tagger);
787			set_parser(parser);
788	0		}
789
790	0		void evaluator::set_model(const model* m) {
791	0		this->m = m;
792	0		}
793
794	0		void evaluator::set_tokenizer(const string& tokenizer) {
795	0		this->tokenizer = tokenizer;
796	0		}
797
798	0		void evaluator::set_tagger(const string& tagger) {
799	0		this->tagger = tagger;
800	0		}
801
802	0		void evaluator::set_parser(const string& parser) {
803	0		this->parser = parser;
804	0		}
805
806	0		bool evaluator::evaluate(istream& is, ostream& os, string& error) const {
807			error.clear();
808
809	0	0	unique_ptr conllu_input(input_format::new_conllu_input_format());
810	0	0	if (!conllu_input) return error.assign("Cannot allocate CoNLL-U input format instance!"), false;
		0
811
812	0	0	vector plain_text_paragraphs(1); unsigned space_after_nos = 0;
813	0	0	sentence system, gold;
		0
814	0		evaluation_data gold_data, system_goldtok_data, system_goldtok_goldtags_data, system_plaintext_data;
815
816			string block;
817	0	0	while (conllu_input->read_block(is, block)) {
		0
818	0	0	conllu_input->set_text(block);
819	0	0	while (conllu_input->next_sentence(gold, error)) {
		0
820	0	0	gold_data.add_sentence(gold);
821
822			// Detokenize the input when tokenizing
823	0	0	if (tokenizer != NONE) {
824	0	0	if (gold.get_new_doc() \|\| gold.get_new_par()) {
		0
		0
		0
		0
825	0	0	plain_text_paragraphs.back().append("\n\n");
826	0	0	plain_text_paragraphs.emplace_back();
827			}
828
829	0	0	for (size_t i = 1, j = 0; i < gold.words.size(); i++) {
830	0	0	const token& tok = j < gold.multiword_tokens.size() && gold.multiword_tokens[j].id_first == int(i) ? (const token&)gold.multiword_tokens[j] : (const token&)gold.words[i];
		0
831			plain_text_paragraphs.back().append(tok.form);
832	0	0	if (tok.get_space_after())
		0
833	0	0	plain_text_paragraphs.back().push_back(' ');
834			else
835	0		space_after_nos += 1;
836	0	0	if (j < gold.multiword_tokens.size() && gold.multiword_tokens[j].id_first == int(i))
		0
		0
837	0		i = gold.multiword_tokens[j++].id_last;
838			}
839			}
840
841			// Goldtok data
842	0	0	if (tokenizer == NONE && tagger != NONE) {
843	0	0	system.clear();
844	0	0	for (size_t i = 1; i < gold.words.size(); i++)
845			system.add_word(gold.words[i].form);
846
847	0	0	if (tagger != NONE) {
848	0	0	if (!m->tag(system, tagger, error))
		0
849			return false;
850	0	0	if (parser != NONE)
851	0	0	if (!m->parse(system, parser, error))
		0
852			return false;
853			}
854	0	0	system_goldtok_data.add_sentence(system);
855			}
856
857			// Goldtok_goldtags data
858	0	0	if (tokenizer == NONE && tagger == NONE && parser != NONE) {
		0
859	0	0	system.clear();
860	0	0	for (size_t i = 1; i < gold.words.size(); i++) {
861			system.add_word(gold.words[i].form);
862	0		system.words[i].upostag = gold.words[i].upostag;
863	0		system.words[i].xpostag = gold.words[i].xpostag;
864	0		system.words[i].feats = gold.words[i].feats;
865	0		system.words[i].lemma = gold.words[i].lemma;
866			}
867	0	0	if (parser != NONE)
868	0	0	if (!m->parse(system, parser, error))
		0
869			return false;
870	0	0	system_goldtok_goldtags_data.add_sentence(system);
871			}
872			}
873	0	0	if (!error.empty()) return false;
874			}
875
876			// Tokenize, tag and parse plaintext input
877	0	0	if (tokenizer != NONE) {
878	0	0	unique_ptr t(m->new_tokenizer(tokenizer));
879	0	0	if (!t) return error.assign("Cannot allocate new tokenizer!"), false;
		0
880
881	0	0	for (auto&& plain_text : plain_text_paragraphs) {
882	0	0	t->set_text(plain_text);
883	0	0	while (t->next_sentence(system, error)) {
		0
884	0	0	if (tagger != NONE) {
885	0	0	if (!m->tag(system, tagger, error))
		0
886			return false;
887
888	0	0	if (parser != NONE)
889	0	0	if (!m->parse(system, parser, error))
		0
890			return false;
891			}
892	0	0	system_plaintext_data.add_sentence(system);
893			}
894	0	0	if (!error.empty()) return false;
895			}
896			}
897
898			// Evaluate from plain text
899	0	0	if (tokenizer != NONE) {
900	0	0	if (system_plaintext_data.chars != gold_data.chars) {
901			os << "Cannot evaluate tokenizer, it returned different sequence of token characters!" << endl;
902			} else {
903			word_alignment plaintext_alignment;
904	0	0	word_alignment::best_alignment(system_plaintext_data, gold_data, plaintext_alignment);
905
906			os << "Number of SpaceAfter=No features in gold data: " << space_after_nos << endl;
907
908	0		auto tokens = evaluate_f1(system_plaintext_data.tokens, gold_data.tokens);
909	0		auto multiwords = evaluate_f1(system_plaintext_data.multiwords, gold_data.multiwords);
910	0		auto sentences = evaluate_f1(system_plaintext_data.sentences, gold_data.sentences);
911	0		auto words = plaintext_alignment.evaluate_f1([](const word&, const word&) {return true;});
912	0	0	if (multiwords.total_gold \|\| multiwords.total_system)
		0
913	0		os << "Tokenizer tokens - system: " << tokens.total_system << ", gold: " << tokens.total_gold
914	0		<< ", precision: " << fixed << setprecision(2) << 100. * tokens.precision
915	0		<< "%, recall: " << 100. * tokens.recall << "%, f1: " << 100. * tokens.f1 << "%" << endl
916			<< "Tokenizer multiword tokens - system: " << multiwords.total_system << ", gold: " << multiwords.total_gold
917	0		<< ", precision: " << fixed << setprecision(2) << 100. * multiwords.precision
918	0		<< "%, recall: " << 100. * multiwords.recall << "%, f1: " << 100. * multiwords.f1 << "%" << endl;
919	0		os << "Tokenizer words - system: " << words.total_system << ", gold: " << words.total_gold
920	0		<< ", precision: " << fixed << setprecision(2) << 100. * words.precision
921	0		<< "%, recall: " << 100. * words.recall << "%, f1: " << 100. * words.f1 << "%" << endl
922	0		<< "Tokenizer sentences - system: " << sentences.total_system << ", gold: " << sentences.total_gold
923	0		<< ", precision: " << fixed << setprecision(2) << 100. * sentences.precision
924	0		<< "%, recall: " << 100. * sentences.recall << "%, f1: " << 100. * sentences.f1 << "%" << endl;
925
926	0	0	if (tagger != NONE) {
927	0		auto upostags = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag; });
928	0		auto xpostags = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.xpostag == u.xpostag; });
929	0		auto feats = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.feats == u.feats; });
930	0	0	auto alltags = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag && w.xpostag == u.xpostag && w.feats == u.feats; });
		0
		0
931	0		auto lemmas = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.lemma == u.lemma; });
932	0		os << "Tagging from plain text (CoNLL17 F1 score) - gold forms: " << upostags.total_gold << ", upostag: "
933	0		<< fixed << setprecision(2) << 100. * upostags.f1 << "%, xpostag: "
934	0		<< 100. * xpostags.f1 << "%, feats: " << 100. * feats.f1 << "%, alltags: "
935	0		<< 100. * alltags.f1 << "%, lemmas: " << 100. * lemmas.f1 << '%' << endl;
936			}
937
938	0	0	if (tagger != NONE && parser != NONE) {
939	0		auto uas = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head; });
940	0	0	auto las = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; });
		0
941	0		os << "Parsing from plain text with computed tags (CoNLL17 F1 score) - gold forms: " << uas.total_gold
942	0		<< ", UAS: " << fixed << setprecision(2) << 100. * uas.f1 << "%, LAS: " << 100. * las.f1 << '%' << endl;
943			}
944			}
945			}
946
947			// Evaluate tagger from gold tokenization
948	0	0	if (tokenizer == NONE && tagger != NONE) {
949			word_alignment goldtok_alignment;
950	0	0	if (!word_alignment::perfect_alignment(system_goldtok_data, gold_data, goldtok_alignment))
		0
951	0	0	return error.assign("Internal UDPipe error (the words of the gold data do not match)!"), false;
952
953	0		auto upostags = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag; });
954	0		auto xpostags = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.xpostag == u.xpostag; });
955	0		auto feats = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.feats == u.feats; });
956	0	0	auto alltags = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag && w.xpostag == u.xpostag && w.feats == u.feats; });
		0
		0
957	0		auto lemmas = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.lemma == u.lemma; });
958	0		os << "Tagging from gold tokenization - forms: " << upostags.total_gold << ", upostag: "
959	0		<< fixed << setprecision(2) << 100. * upostags.f1 << "%, xpostag: "
960	0		<< 100. * xpostags.f1 << "%, feats: " << 100. * feats.f1 << "%, alltags: "
961	0		<< 100. * alltags.f1 << "%, lemmas: " << 100. * lemmas.f1 << '%' << endl;
962
963	0	0	if (parser != NONE) {
964	0		auto uas = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head; });
965	0	0	auto las = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; });
		0
966	0		os << "Parsing from gold tokenization with computed tags - forms: " << uas.total_gold
967	0		<< ", UAS: " << fixed << setprecision(2) << 100. * uas.f1 << "%, LAS: " << 100. * las.f1 << '%' << endl;
968			}
969			}
970
971			// Evaluate parser from gold tokenization
972	0	0	if (tokenizer == NONE && tagger == NONE && parser != NONE) {
		0
973			word_alignment goldtok_goldtags_alignment;
974	0	0	if (!word_alignment::perfect_alignment(system_goldtok_goldtags_data, gold_data, goldtok_goldtags_alignment))
		0
975	0	0	return error.assign("Internal UDPipe error (the words of the goldtok data do not match)!"), false;
976
977	0		auto uas = goldtok_goldtags_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head; });
978	0	0	auto las = goldtok_goldtags_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; });
		0
979	0		os << "Parsing from gold tokenization with gold tags - forms: " << uas.total_gold
980	0		<< ", UAS: " << fixed << setprecision(2) << 100. * uas.f1 << "%, LAS: " << 100. * las.f1 << '%' << endl;
981			}
982
983			return true;
984			}
985
986			template
987	0		evaluator::f1_info evaluator::evaluate_f1(const vector>& system, const vector>& gold) {
988			size_t both = 0;
989	0	0	for (size_t si = 0, gi = 0; si < system.size() \|\| gi < gold.size(); )
		0
		0
		0
		0
		0
990	0	0	if (si < system.size() && (gi == gold.size() \|\| system[si].first < gold[gi].first))
		0
		0
		0
		0
		0
		0
		0
991	0		si++;
992	0	0	else if (gi < gold.size() && (si == system.size() \|\| gold[gi].first < system[si].first))
		0
		0
		0
		0
		0
		0
		0
993	0		gi++;
994			else
995	0		both += system[si++].second == gold[gi++].second;
996
997			return {system.size(), gold.size(), system.size() ? both / double(system.size()) : 0.,
998	0	0	gold.size() ? both / double(gold.size()) : 0., system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0. };
		0
		0
		0
		0
		0
999			}
1000
1001	0		evaluator::evaluation_data::word_data::word_data(size_t start, size_t end, int id, bool is_multiword, const word& w)
1002	0		: start(start), end(end), is_multiword(is_multiword), w(w)
1003			{
1004			// Use absolute ids for words and heads
1005	0		this->w.id = id;
1006	0	0	this->w.head = w.head ? id + (w.head - w.id) : 0;
1007
1008			// Forms in MWTs are compares case-insensitively in LCS, therefore
1009			// we lowercase them here.
1010	0		unilib::utf8::map(unilib::unicode::lowercase, w.form, this->w.form);
1011
1012			// During evaluation, only universal part of DEPREL (up to a colon) is used.
1013	0		auto colon = w.deprel.find(':');
1014	0	0	if (colon != string::npos)
1015	0	0	this->w.deprel.erase(colon);
1016	0		}
1017
1018	0		void evaluator::evaluation_data::add_sentence(const sentence& s) {
1019	0		sentences.emplace_back(chars.size(), chars.size());
1020	0	0	for (size_t i = 1, j = 0; i < s.words.size(); i++) {
1021	0		tokens.emplace_back(chars.size(), chars.size());
1022	0	0	const string& form = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? s.multiword_tokens[j].form : s.words[i].form;
		0
1023	0	0	for (auto&& chr : unilib::utf8::decoder(form))
1024	0	0	if (chr != ' ')
1025	0		chars.push_back(chr);
1026	0		tokens.back().second = chars.size();
1027
1028	0	0	if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) {
		0
		0
1029	0		multiwords.emplace_back(tokens.back().first, form);
1030	0	0	for (size_t k = i; int(k) <= s.multiword_tokens[j].id_last; k++) {
1031	0		words.emplace_back(tokens.back().first, tokens.back().second, (int)words.size() + 1, true, s.words[k]);
1032	0		multiwords.back().second.append(" ").append(words.back().w.form);
1033			}
1034	0		i = s.multiword_tokens[j++].id_last;
1035			} else {
1036	0		words.emplace_back(tokens.back().first, tokens.back().second, (int)words.size() + 1, false, s.words[i]);
1037			}
1038			}
1039	0		sentences.back().second = chars.size();
1040	0		}
1041
1042			template
1043	0		evaluator::f1_info evaluator::word_alignment::evaluate_f1(Equals equals) {
1044			size_t both = 0;
1045	0	0	for (auto&& match : matched)
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
1046	0	0	if (equals(match.system, match.gold))
		0
		0
		0
1047	0		both++;
1048
1049			return {total_system, total_gold, total_system ? both / double(total_system) : 0.,
1050	0	0	total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. };
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
1051			}
1052
1053	0		bool evaluator::word_alignment::perfect_alignment(const evaluation_data& system, const evaluation_data& gold, word_alignment& alignment) {
1054	0		alignment.total_system = system.words.size();
1055	0		alignment.total_gold = gold.words.size();
1056	0	0	if (alignment.total_system != alignment.total_gold) return false;
1057
1058			alignment.matched.clear();
1059	0		alignment.matched.reserve(alignment.total_system);
1060	0	0	for (size_t i = 0; i < system.words.size(); i++) {
1061	0	0	if (system.words[i].w.form != gold.words[i].w.form)
1062			return false;
1063	0		alignment.matched.emplace_back(system.words[i].w, gold.words[i].w);
1064			}
1065
1066			return true;
1067			}
1068
1069	0		void evaluator::word_alignment::best_alignment(const evaluation_data& system, const evaluation_data& gold, word_alignment& alignment) {
1070	0		alignment.total_system = system.words.size();
1071	0		alignment.total_gold = gold.words.size();
1072			alignment.matched.clear();
1073
1074	0	0	for (size_t si = 0, gi = 0; si < system.words.size() && gi < gold.words.size(); )
		0
		0
1075	0	0	if ((system.words[si].start > gold.words[gi].start \|\| !system.words[si].is_multiword) &&
		0
		0
		0
1076	0	0	(gold.words[gi].start > system.words[si].start \|\| !gold.words[gi].is_multiword)) {
1077			// No multiword, align using start+end indices
1078	0	0	if (system.words[si].start == gold.words[gi].start && system.words[si].end == gold.words[gi].end)
		0
		0
1079	0		alignment.matched.emplace_back(system.words[si++].w, gold.words[gi++].w);
1080	0	0	else if (system.words[si].start <= gold.words[gi].start)
1081	0		si++;
1082			else
1083	0		gi++;
1084			} else {
1085			// We have a multiword
1086	0	0	size_t ss = si, gs = gi, multiword_range_end = system.words[si].is_multiword ? system.words[si].end : gold.words[gi].end;
1087
1088			// Find all words in the multiword range
1089	0	0	while ((si < system.words.size() && (system.words[si].is_multiword ? system.words[si].start < multiword_range_end :
		0
		0
1090	0	0	system.words[si].end <= multiword_range_end)) \|\|
		0
		0
		0
1091	0	0	(gi < gold.words.size() && (gold.words[gi].is_multiword ? gold.words[gi].start < multiword_range_end :
		0
1092	0		gold.words[gi].end <= multiword_range_end))) {
1093			// Extend the multiword range
1094	0	0	if (si < system.words.size() && (gi >= gold.words.size() \|\| system.words[si].start <= gold.words[gi].start)) {
		0
		0
		0
1095	0	0	if (system.words[si].is_multiword) multiword_range_end = max(multiword_range_end, system.words[si].end);
1096	0		si++;
1097			} else {
1098	0	0	if (gold.words[gi].is_multiword) multiword_range_end = max(multiword_range_end, gold.words[gi].end);
1099	0		gi++;
1100			}
1101			}
1102
1103			// LCS on the chosen words
1104	0		vector> lcs(si - ss);
1105	0	0	for (unsigned s = si - ss; s--; ) {
1106	0	0	lcs[s].resize(gi - gs);
1107	0	0	for (unsigned g = gi - gs; g--; ) {
1108	0	0	lcs[s][g] = max(lcs[s][g], s+1 < lcs.size() ? lcs[s+1][g] : 0);
1109	0	0	lcs[s][g] = max(lcs[s][g], g+1 < lcs[s].size() ? lcs[s][g+1] : 0);
1110	0	0	if (system.words[ss + s].w.form == gold.words[gs + g].w.form)
1111	0	0	lcs[s][g] = max(lcs[s][g], 1 + (s+1 < lcs.size() && g+1 < lcs[s].size() ? lcs[s+1][g+1] : 0));
		0
1112			}
1113			}
1114
1115	0	0	for (unsigned s = 0, g = 0; s < si - ss && g < gi - gs; ) {
		0
1116	0	0	if (system.words[ss + s].w.form == gold.words[gs + g].w.form)
1117	0	0	alignment.matched.emplace_back(system.words[ss + s++].w, gold.words[gs + g++].w);
1118	0	0	else if (lcs[s][g] == (s+1 < lcs.size() ? lcs[s+1][g] : 0))
		0
1119			s++;
1120			else /* if (lcs[s][g] == (g+1 < lcs[s].size() ? lcs[s][g+1] : 0)) */
1121	0		g++;
1122			}
1123			}
1124
1125			// Reindex HEAD pointers in system to use gold indices
1126	0		vector gold_aligned(system.words.size(), -1);
1127	0	0	for (auto&& match : alignment.matched)
1128	0		gold_aligned[match.system.id - 1] = match.gold.id;
1129	0	0	for (auto&& match : alignment.matched)
1130	0	0	if (match.system.head > 0)
1131	0		match.system.head = gold_aligned[match.system.head - 1];
1132	0		}
1133
1134			/////////
1135			// File: morphodita/derivator/derivator.h
1136			/////////
1137
1138			// This file is part of MorphoDiTa .
1139			//
1140			// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
1141			// Mathematics and Physics, Charles University in Prague, Czech Republic.
1142			//
1143			// This Source Code Form is subject to the terms of the Mozilla Public
1144			// License, v. 2.0. If a copy of the MPL was not distributed with this
1145			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
1146
1147			namespace morphodita {
1148
1149	0		struct derivated_lemma {
1150			string lemma;
1151			};
1152
1153	0		class derivator {
1154			public:
1155	0		virtual ~derivator() {}
1156
1157			// For given lemma, return the parent in the derivation graph.
1158			// The lemma is assumed to be lemma id and any lemma comments are ignored.
1159			virtual bool parent(string_piece lemma, derivated_lemma& parent) const = 0;
1160
1161			// For given lemma, return the children in the derivation graph.
1162			// The lemma is assumed to be lemma id and any lemma comments are ignored.
1163			virtual bool children(string_piece lemma, vector& children) const = 0;
1164			};
1165
1166			} // namespace morphodita
1167
1168			/////////
1169			// File: morphodita/tokenizer/tokenizer.h
1170			/////////
1171
1172			// This file is part of MorphoDiTa .
1173			//
1174			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
1175			// Mathematics and Physics, Charles University in Prague, Czech Republic.
1176			//
1177			// This Source Code Form is subject to the terms of the Mozilla Public
1178			// License, v. 2.0. If a copy of the MPL was not distributed with this
1179			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
1180
1181			namespace morphodita {
1182
1183			// Range of a token, measured in Unicode characters, not UTF8 bytes.
1184			struct token_range {
1185			size_t start;
1186			size_t length;
1187
1188			token_range() {}
1189	7		token_range(size_t start, size_t length) : start(start), length(length) {}
1190			};
1191
1192	1		class tokenizer {
1193			public:
1194	1		virtual ~tokenizer() {}
1195
1196			virtual void set_text(string_piece text, bool make_copy = false) = 0;
1197			virtual bool next_sentence(vector* forms, vector* tokens) = 0;
1198
1199			// Static factory methods
1200			static tokenizer* new_vertical_tokenizer();
1201
1202			static tokenizer* new_czech_tokenizer();
1203			static tokenizer* new_english_tokenizer();
1204			static tokenizer* new_generic_tokenizer();
1205			};
1206
1207			} // namespace morphodita
1208
1209			/////////
1210			// File: morphodita/morpho/morpho.h
1211			/////////
1212
1213			// This file is part of MorphoDiTa .
1214			//
1215			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
1216			// Mathematics and Physics, Charles University in Prague, Czech Republic.
1217			//
1218			// This Source Code Form is subject to the terms of the Mozilla Public
1219			// License, v. 2.0. If a copy of the MPL was not distributed with this
1220			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
1221
1222			namespace morphodita {
1223
1224	0		struct tagged_form {
1225			string form;
1226			string tag;
1227
1228			tagged_form() {}
1229	0		tagged_form(const string& form, const string& tag) : form(form), tag(tag) {}
1230			};
1231
1232	46		struct tagged_lemma {
1233			string lemma;
1234			string tag;
1235
1236			tagged_lemma() {}
1237	10		tagged_lemma(const string& lemma, const string& tag) : lemma(lemma), tag(tag) {}
1238			};
1239
1240	0		struct tagged_lemma_forms {
1241			string lemma;
1242			vector forms;
1243
1244			tagged_lemma_forms() {}
1245	0		tagged_lemma_forms(const string& lemma) : lemma(lemma) {}
1246			};
1247
1248	1		class morpho {
1249			public:
1250	2		virtual ~morpho() {}
1251
1252			static morpho* load(istream& is);
1253			static morpho* load(const char* fname);
1254
1255			enum guesser_mode { NO_GUESSER = 0, GUESSER = 1, GUESSER_UNSPECIFIED = -1 };
1256
1257			// Perform morphologic analysis of a form. The form is given by a pointer and
1258			// length and therefore does not need to be '\0' terminated. The guesser
1259			// parameter specifies whether a guesser can be used if the form is not found
1260			// in the dictionary. Output is assigned to the lemmas vector.
1261			//
1262			// If the form is found in the dictionary, analyses are assigned to lemmas
1263			// and NO_GUESSER returned. If guesser == GUESSER and the form analyses are
1264			// found using a guesser, they are assigned to lemmas and GUESSER is
1265			// returned. Otherwise <0 is returned and lemmas are filled with one
1266			// analysis containing given form as lemma and a tag for unknown word.
1267			virtual int analyze(string_piece form, guesser_mode guesser, vector& lemmas) const = 0;
1268
1269			// Perform morphologic generation of a lemma. The lemma is given by a pointer
1270			// and length and therefore does not need to be '\0' terminated. Optionally
1271			// a tag_wildcard can be specified (or be NULL) and if so, results are
1272			// filtered using this wildcard. The guesser parameter speficies whether
1273			// a guesser can be used if the lemma is not found in the dictionary. Output
1274			// is assigned to the forms vector.
1275			//
1276			// Tag_wildcard can be either NULL or a wildcard applied to the results.
1277			// A ? in the wildcard matches any character, [bytes] matches any of the
1278			// bytes and [^bytes] matches any byte different from the specified ones.
1279			// A - has no special meaning inside the bytes and if ] is first in bytes, it
1280			// does not end the bytes group.
1281			//
1282			// If the given lemma is only a raw lemma, all lemma ids with this raw lemma
1283			// are returned. Otherwise only matching lemma ids are returned, ignoring any
1284			// lemma comments. For every found lemma, matching forms are filtered using
1285			// the tag_wildcard. If at least one lemma is found in the dictionary,
1286			// NO_GUESSER is returned. If guesser == GUESSER and the lemma is found by
1287			// the guesser, GUESSER is returned. Otherwise, forms are cleared and <0 is
1288			// returned.
1289			virtual int generate(string_piece lemma, const char* tag_wildcard, guesser_mode guesser, vector& forms) const = 0;
1290
1291			// Rawlemma and lemma id identification
1292			virtual int raw_lemma_len(string_piece lemma) const = 0;
1293			virtual int lemma_id_len(string_piece lemma) const = 0;
1294
1295			// Rawform identification
1296			virtual int raw_form_len(string_piece form) const = 0;
1297
1298			// Construct a new tokenizer instance appropriate for this morphology.
1299			// Can return NULL if no such tokenizer exists.
1300			virtual tokenizer* new_tokenizer() const = 0;
1301
1302			// Return a derivator for this morphology, or NULL if it does not exist.
1303			// The returned instance is owned by the morphology and should not be deleted.
1304			virtual const derivator* get_derivator() const;
1305
1306			protected:
1307			unique_ptr derinet;
1308			};
1309
1310			} // namespace morphodita
1311
1312			/////////
1313			// File: morphodita/tokenizer/tokenizer_factory.h
1314			/////////
1315
1316			// This file is part of MorphoDiTa .
1317			//
1318			// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
1319			// Mathematics and Physics, Charles University in Prague, Czech Republic.
1320			//
1321			// This Source Code Form is subject to the terms of the Mozilla Public
1322			// License, v. 2.0. If a copy of the MPL was not distributed with this
1323			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
1324
1325			namespace morphodita {
1326
1327	1		class tokenizer_factory {
1328			public:
1329	1		virtual ~tokenizer_factory() {}
1330
1331			static tokenizer_factory* load(istream& is);
1332			static tokenizer_factory* load(const char* fname);
1333
1334			// Construct a new tokenizer instance.
1335			virtual tokenizer* new_tokenizer(const morpho* m) const = 0;
1336			};
1337
1338			} // namespace morphodita
1339
1340			/////////
1341			// File: morphodita/tagger/tagger.h
1342			/////////
1343
1344			// This file is part of MorphoDiTa .
1345			//
1346			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
1347			// Mathematics and Physics, Charles University in Prague, Czech Republic.
1348			//
1349			// This Source Code Form is subject to the terms of the Mozilla Public
1350			// License, v. 2.0. If a copy of the MPL was not distributed with this
1351			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
1352
1353			namespace morphodita {
1354
1355	1		class tagger {
1356			public:
1357	1		virtual ~tagger() {}
1358
1359			static tagger* load(const char* fname);
1360			static tagger* load(istream& is);
1361
1362			// Return morpho associated with the tagger. Do not delete the pointer, it is
1363			// owned by the tagger instance and deleted in the tagger destructor.
1364			virtual const morpho* get_morpho() const = 0;
1365
1366			// Perform morphologic analysis and subsequent disambiguation.
1367			virtual void tag(const vector& forms, vector& tags, morpho::guesser_mode guesser = morpho::GUESSER_UNSPECIFIED) const = 0;
1368
1369			// Perform disambiguation only on given analyses.
1370			virtual void tag_analyzed(const vector& forms, const vector>& analyses, vector& tags) const = 0;
1371
1372			// Construct a new tokenizer instance appropriate for this tagger.
1373			// Can return NULL if no such tokenizer exists.
1374			// Is equal to get_morpho()->new_tokenizer.
1375			tokenizer* new_tokenizer() const;
1376			};
1377
1378			} // namespace morphodita
1379
1380			/////////
1381			// File: parsito/tree/node.h
1382			/////////
1383
1384			// This file is part of Parsito .
1385			//
1386			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
1387			// Mathematics and Physics, Charles University in Prague, Czech Republic.
1388			//
1389			// This Source Code Form is subject to the terms of the Mozilla Public
1390			// License, v. 2.0. If a copy of the MPL was not distributed with this
1391			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
1392
1393			namespace parsito {
1394
1395	23	0	class node {
1396			public:
1397			int id; // 0 is root, >0 is sentence node, <0 is undefined
1398			string form; // form
1399			string lemma; // lemma
1400			string upostag; // universal part-of-speech tag
1401			string xpostag; // language-specific part-of-speech tag
1402			string feats; // list of morphological features
1403			int head; // head, 0 is root, <0 is without parent
1404			string deprel; // dependency relation to the head
1405			string deps; // secondary dependencies
1406			string misc; // miscellaneous information
1407
1408			vector children;
1409
1410	9		node(int id = -1, const string& form = string()) : id(id), form(form), head(-1) {}
1411			};
1412
1413			} // namespace parsito
1414
1415			/////////
1416			// File: parsito/tree/tree.h
1417			/////////
1418
1419			// This file is part of Parsito .
1420			//
1421			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
1422			// Mathematics and Physics, Charles University in Prague, Czech Republic.
1423			//
1424			// This Source Code Form is subject to the terms of the Mozilla Public
1425			// License, v. 2.0. If a copy of the MPL was not distributed with this
1426			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
1427
1428			namespace parsito {
1429
1430	1	0	class tree {
		0
		0
		0
		0
1431			public:
1432			tree();
1433
1434			vector nodes;
1435
1436			bool empty();
1437			void clear();
1438			node& add_node(const string& form);
1439			void set_head(int id, int head, const string& deprel);
1440			void unlink_all_nodes();
1441
1442			static const string root_form;
1443			};
1444
1445			} // namespace parsito
1446
1447			/////////
1448			// File: parsito/configuration/configuration.h
1449			/////////
1450
1451			// This file is part of Parsito .
1452			//
1453			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
1454			// Mathematics and Physics, Charles University in Prague, Czech Republic.
1455			//
1456			// This Source Code Form is subject to the terms of the Mozilla Public
1457			// License, v. 2.0. If a copy of the MPL was not distributed with this
1458			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
1459
1460			namespace parsito {
1461
1462	167		class configuration {
1463			public:
1464	11		configuration(bool single_root) : single_root(single_root) {}
1465
1466			void init(tree* t);
1467			bool final();
1468
1469			tree* t;
1470			vector stack;
1471			vector buffer;
1472
1473			bool single_root;
1474			};
1475
1476			} // namespace parsito
1477
1478			/////////
1479			// File: utils/binary_decoder.h
1480			/////////
1481
1482			// This file is part of UFAL C++ Utils .
1483			//
1484			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
1485			// Mathematics and Physics, Charles University in Prague, Czech Republic.
1486			//
1487			// This Source Code Form is subject to the terms of the Mozilla Public
1488			// License, v. 2.0. If a copy of the MPL was not distributed with this
1489			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
1490
1491			namespace utils {
1492
1493			//
1494			// Declarations
1495			//
1496
1497	0		class binary_decoder_error : public runtime_error {
1498			public:
1499	0	0	explicit binary_decoder_error(const char* description) : runtime_error(description) {}
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
1500			};
1501
1502	5	0	class binary_decoder {
		0
		0
		0
		0
		0
		0
		0
		0
1503			public:
1504			inline unsigned char* fill(unsigned len);
1505
1506			inline unsigned next_1B();
1507			inline unsigned next_2B();
1508			inline unsigned next_4B();
1509			inline void next_str(string& str);
1510			template inline const T* next(unsigned elements);
1511
1512			inline bool is_end();
1513			inline unsigned tell();
1514			inline void seek(unsigned pos);
1515
1516			private:
1517			vector buffer;
1518			const unsigned char* data;
1519			const unsigned char* data_end;
1520			};
1521
1522			//
1523			// Definitions
1524			//
1525
1526			unsigned char* binary_decoder::fill(unsigned len) {
1527	6	50	buffer.resize(len);
1528	6		data = buffer.data();
1529	6		data_end = buffer.data() + len;
1530
1531			return buffer.data();
1532			}
1533
1534	2616		unsigned binary_decoder::next_1B() {
1535	1308	50	if (data + 1 > data_end) throw binary_decoder_error("No more data in binary_decoder");
1536	1308		return *data++;
1537			}
1538
1539	26		unsigned binary_decoder::next_2B() {
1540	26	50	if (data + sizeof(uint16_t) > data_end) throw binary_decoder_error("No more data in binary_decoder");
1541			uint16_t result;
1542			memcpy(&result, data, sizeof(uint16_t));
1543	26		data += sizeof(uint16_t);
1544	26		return result;
1545			}
1546
1547	1573		unsigned binary_decoder::next_4B() {
1548	1573	50	if (data + sizeof(uint32_t) > data_end) throw binary_decoder_error("No more data in binary_decoder");
1549			uint32_t result;
1550			memcpy(&result, data, sizeof(uint32_t));
1551	1573		data += sizeof(uint32_t);
1552	1573		return result;
1553			}
1554
1555	36		void binary_decoder::next_str(string& str) {
1556	36		unsigned len = next_1B();
1557	36	100	if (len == 255) len = next_4B();
1558	36		str.assign(next(len), len);
1559	36		}
1560
1561	2544		template const T* binary_decoder::next(unsigned elements) {
1562	1272	50	if (data + sizeof(T) * elements > data_end) throw binary_decoder_error("No more data in binary_decoder");
		50
		50
1563			const T* result = (const T*) data;
1564	1272		data += sizeof(T) * elements;
1565	1272		return result;
1566			}
1567
1568			bool binary_decoder::is_end() {
1569	4		return data >= data_end;
1570			}
1571
1572			unsigned binary_decoder::tell() {
1573	1		return data - buffer.data();
1574			}
1575
1576	1		void binary_decoder::seek(unsigned pos) {
1577	1	50	if (pos > buffer.size()) throw binary_decoder_error("Cannot seek past end of binary_decoder");
1578	1		data = buffer.data() + pos;
1579	1		}
1580
1581			} // namespace utils
1582
1583			/////////
1584			// File: parsito/parser/parser.h
1585			/////////
1586
1587			// This file is part of Parsito .
1588			//
1589			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
1590			// Mathematics and Physics, Charles University in Prague, Czech Republic.
1591			//
1592			// This Source Code Form is subject to the terms of the Mozilla Public
1593			// License, v. 2.0. If a copy of the MPL was not distributed with this
1594			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
1595
1596			namespace parsito {
1597
1598			// Parser
1599	1		class parser {
1600			public:
1601	1		virtual ~parser() {};
1602
1603			virtual void parse(tree& t, unsigned beam_size = 0, double* cost = nullptr) const = 0;
1604
1605			enum { NO_CACHE = 0, FULL_CACHE = 2147483647};
1606			static parser* load(const char* file, unsigned cache = 1000);
1607			static parser* load(istream& in, unsigned cache = 1000);
1608
1609			protected:
1610			virtual void load(binary_decoder& data, unsigned cache) = 0;
1611			static parser* create(const string& name);
1612			};
1613
1614			} // namespace parsito
1615
1616			/////////
1617			// File: tokenizer/multiword_splitter.h
1618			/////////
1619
1620			// This file is part of UDPipe .
1621			//
1622			// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
1623			// Mathematics and Physics, Charles University in Prague, Czech Republic.
1624			//
1625			// This Source Code Form is subject to the terms of the Mozilla Public
1626			// License, v. 2.0. If a copy of the MPL was not distributed with this
1627			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
1628
1629	1		class multiword_splitter {
1630			public:
1631			void append_token(string_piece token, string_piece misc, sentence& s) const;
1632
1633			static multiword_splitter* load(istream& is);
1634
1635			private:
1636	1		multiword_splitter(unsigned version) : version(version) {}
1637			unsigned version;
1638			enum { VERSION_LATEST = 2 };
1639			friend class multiword_splitter_trainer;
1640
1641	0		struct suffix_info {
1642			vector words;
1643			};
1644			unordered_map full_rules, suffix_rules;
1645			};
1646
1647			/////////
1648			// File: utils/parse_int.h
1649			/////////
1650
1651			// This file is part of UFAL C++ Utils .
1652			//
1653			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
1654			// Mathematics and Physics, Charles University in Prague, Czech Republic.
1655			//
1656			// This Source Code Form is subject to the terms of the Mozilla Public
1657			// License, v. 2.0. If a copy of the MPL was not distributed with this
1658			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
1659
1660			namespace utils {
1661
1662			//
1663			// Declarations
1664			//
1665
1666			// Try to parse an int from given string. If the int cannot be parsed or does
1667			// not fit into int, false is returned and the error string is filled using the
1668			// value_name argument.
1669			inline bool parse_int(string_piece str, const char* value_name, int& value, string& error);
1670
1671			// Try to parse an int from given string. If the int cannot be parsed or does
1672			// not fit into int, an error is displayed and program exits.
1673			inline int parse_int(string_piece str, const char* value_name);
1674
1675			//
1676			// Definitions
1677			//
1678
1679	68		bool parse_int(string_piece str, const char* value_name, int& value, string& error) {
1680			string_piece original = str;
1681
1682			// Skip spaces
1683	34	50	while (str.len && (str.str[0] == ' ' \|\| str.str[0] == '\f' \|\| str.str[0] == '\n' \|\| str.str[0] == '\r' \|\| str.str[0] == '\t' \|\| str.str[0] == '\v'))
		50
		50
		50
		50
		50
1684	0		str.str++, str.len--;
1685
1686			// Allow minus
1687			bool positive = true;
1688	34	50	if (str.len && (str.str[0] == '+' \|\| str.str[0] == '-')) {
		100
1689			positive = str.str[0] == '+';
1690	8		str.str++, str.len--;
1691			}
1692
1693			// Parse value, checking for overflow/underflow
1694	34	50	if (!str.len) return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': empty string."), false;
1695			if (!(str.str[0] >= '0' \|\| str.str[0] <= '9')) return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': non-digit character found."), false;
1696
1697	34		value = 0;
1698	68	100	while (str.len && str.str[0] >= '0' && str.str[0] <= '9') {
		50
		50
1699	34	100	if (positive) {
1700	26	50	if (value > (numeric_limits::max() - (str.str[0] - '0')) / 10)
1701	0		return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': overflow occured."), false;
1702	26		value = 10 * value + (str.str[0] - '0');
1703			} else {
1704	8	50	if (value < (numeric_limits::min() + (str.str[0] - '0')) / 10)
1705	0		return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': underflow occured."), false;
1706	8		value = 10 * value - (str.str[0] - '0');
1707			}
1708	34		str.str++, str.len--;
1709			}
1710
1711			// Skip spaces
1712	34	50	while (str.len && (str.str[0] == ' ' \|\| str.str[0] == '\f' \|\| str.str[0] == '\n' \|\| str.str[0] == '\r' \|\| str.str[0] == '\t' \|\| str.str[0] == '\v'))
		0
		0
		0
		0
		0
1713	0		str.str++, str.len--;
1714
1715			// Check for remaining characters
1716	34	50	if (str.len) return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': non-digit character found."), false;
1717
1718			return true;
1719			}
1720
1721	0		int parse_int(string_piece str, const char* value_name) {
1722			int result;
1723			string error;
1724
1725	0	0	if (!parse_int(str, value_name, result, error))
		0
1726	0		runtime_failure(error);
1727
1728	0		return result;
1729			}
1730
1731			} // namespace utils
1732
1733			/////////
1734			// File: utils/path_from_utf8.h
1735			/////////
1736
1737			// This file is part of UFAL C++ Utils .
1738			//
1739			// Copyright 2022 Institute of Formal and Applied Linguistics, Faculty of
1740			// Mathematics and Physics, Charles University in Prague, Czech Republic.
1741			//
1742			// This Source Code Form is subject to the terms of the Mozilla Public
1743			// License, v. 2.0. If a copy of the MPL was not distributed with this
1744			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
1745
1746			namespace utils {
1747
1748			//
1749			// Declarations
1750			//
1751
1752			#ifdef _WIN32
1753			inline wstring path_from_utf8(const char* str);
1754			inline wstring path_from_utf8(const string& str);
1755			#else
1756			inline string path_from_utf8(const char* str);
1757			inline const string& path_from_utf8(const string& str);
1758			#endif
1759
1760			//
1761			// Definitions
1762			//
1763
1764			#ifdef _WIN32
1765
1766			inline wstring path_from_utf8(const char* str) {
1767			// We could implement this using codecvt_utf8_utf16, but it is not available
1768			// in GCC 4.9, which we still use. We could also use MultiByteToWideChar,
1769			// but using it would require changing our build infrastructure -- hence
1770			// we implement the conversion manually.
1771			wstring wstr;
1772			while (*str) {
1773			char32_t chr;
1774			if (((unsigned char)str) < 0x80) chr = (unsigned char)str++;
1775			else if (((unsigned char)*str) < 0xC0) chr = '?', ++str;
1776			else if (((unsigned char)*str) < 0xE0) {
1777			chr = (((unsigned char)*str++) & 0x1F) << 6;
1778			if (((unsigned char)str) < 0x80 \|\| ((unsigned char)str) >= 0xC0) chr = '?';
1779			else chr += ((unsigned char)*str++) & 0x3F;
1780			} else if (((unsigned char)*str) < 0xF0) {
1781			chr = (((unsigned char)*str++) & 0x0F) << 12;
1782			if (((unsigned char)str) < 0x80 \|\| ((unsigned char)str) >= 0xC0) chr = '?';
1783			else {
1784			chr += (((unsigned char)*str++) & 0x3F) << 6;
1785			if (((unsigned char)str) < 0x80 \|\| ((unsigned char)str) >= 0xC0) chr = '?';
1786			else chr += ((unsigned char)*str++) & 0x3F;
1787			}
1788			} else if (((unsigned char)*str) < 0xF8) {
1789			chr = (((unsigned char)*str++) & 0x07) << 18;
1790			if (((unsigned char)str) < 0x80 \|\| ((unsigned char)str) >= 0xC0) chr = '?';
1791			else {
1792			chr += (((unsigned char)*str++) & 0x3F) << 12;
1793			if (((unsigned char)str) < 0x80 \|\| ((unsigned char)str) >= 0xC0) chr = '?';
1794			else {
1795			chr += (((unsigned char)*str++) & 0x3F) << 6;
1796			if (((unsigned char)str) < 0x80 \|\| ((unsigned char)str) >= 0xC0) chr = '?';
1797			else chr += ((unsigned char)*str++) & 0x3F;
1798			}
1799			}
1800			} else chr = '?', ++str;
1801
1802			if (chr <= 0xFFFF) wstr.push_back(chr);
1803			else if (chr <= 0x10FFFF) {
1804			wstr.push_back(0xD800 + ((chr - 0x10000) >> 10));
1805			wstr.push_back(0xDC00 + ((chr - 0x10000) & 0x3FF));
1806			} else {
1807			wstr.push_back('?');
1808			}
1809			}
1810			return wstr;
1811			}
1812
1813			inline wstring path_from_utf8(const string& str) {
1814			return path_from_utf8(str.c_str());
1815			}
1816
1817			#else
1818
1819			inline string path_from_utf8(const char* str) {
1820	1		return str;
1821			}
1822
1823			inline const string& path_from_utf8(const string& str) {
1824			return str;
1825			}
1826
1827			#endif
1828
1829			} // namespace utils
1830
1831			/////////
1832			// File: utils/named_values.h
1833			/////////
1834
1835			// This file is part of UFAL C++ Utils .
1836			//
1837			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
1838			// Mathematics and Physics, Charles University in Prague, Czech Republic.
1839			//
1840			// This Source Code Form is subject to the terms of the Mozilla Public
1841			// License, v. 2.0. If a copy of the MPL was not distributed with this
1842			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
1843
1844			namespace utils {
1845
1846			//
1847			// Declarations
1848			//
1849
1850			class named_values {
1851			public:
1852			typedef unordered_map map;
1853
1854			inline static bool parse(const string& values, map& parsed_values, string& error);
1855			};
1856
1857			//
1858			// Definitions
1859			//
1860
1861	3		bool named_values::parse(const string& values, map& parsed_values, string& error) {
1862			error.clear();
1863			parsed_values.clear();
1864
1865			string name, file;
1866	3	50	for (size_t start = 0; start < values.size(); ) {
1867	0	0	while (start < values.size() && values[start] == ';') start++;
		0
		0
1868	0	0	if (start >= values.size()) break;
1869
1870			size_t name_end = values.find_first_of("=;", start);
1871	0	0	name.assign(values, start, name_end - start);
1872			string& value = parsed_values[name];
1873
1874	0	0	if (name_end == string::npos) {
1875			start = name_end;
1876	0	0	} else if (values[name_end] == ';') {
1877	0		start = name_end + 1;
1878			} else /* if (values[name_end] == '=') */ {
1879			size_t equal_sign = name_end;
1880
1881	0	0	if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "file:") == 0) {
		0
		0
		0
1882			// Value of type file:
1883			size_t file_name = equal_sign + 1 + 5;
1884	0		size_t semicolon = min(values.find(';', file_name), values.size());
1885
1886	0	0	file.assign(values, file_name, semicolon - file_name);
1887	0	0	ifstream is(path_from_utf8(file).c_str());
1888	0	0	if (!is.is_open()) return error.assign("Cannot open file '").append(file).append("'!"), false;
		0
		0
1889
1890			char buffer[1024];
1891	0	0	for (value.clear(); is.read(buffer, sizeof(buffer)); )
		0
1892	0	0	value.append(buffer, sizeof(buffer));
1893	0	0	value.append(buffer, is.gcount());
1894
1895	0		start = semicolon + 1;
1896	0	0	} else if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "data:") == 0) {
		0
		0
		0
1897			// Value of type data:
1898			size_t data_size_start = equal_sign + 1 + 5;
1899	0		size_t data_size_end = values.find(':', data_size_start);
1900	0	0	if (data_size_end == string::npos) return error.assign("Cannot parse named values, data size of value '").append(name).append("' not terminated!"), false;
		0
		0
1901
1902			int data_size;
1903	0	0	if (!parse_int(string_piece(values.c_str() + data_size_start, data_size_end - data_size_start), "data_size", data_size, error)) return false;
		0
1904
1905	0		size_t data_start = data_size_end + 1, data_end = data_start + data_size;
1906	0	0	if (data_end > values.size()) return error.assign("Cannot parse named values, value '").append(name).append("' shorter than specified length!"), false;
		0
		0
1907	0	0	if (data_end < values.size() && values[data_end] != ';') return error.assign("Cannot parse named values, value '").append(name).append("' not terminated by semicolon!"), false;
		0
		0
		0
		0
1908
1909	0	0	value.assign(values, data_start, data_end - data_start);
1910	0		start = data_end + 1;
1911			} else {
1912			// Value of string type
1913	0		size_t semicolon = min(values.find(';', equal_sign), values.size());
1914	0	0	value.assign(values, equal_sign + 1, semicolon - equal_sign - 1);
1915	0		start = semicolon + 1;
1916			}
1917			}
1918			}
1919
1920			return true;
1921			}
1922
1923			} // namespace utils
1924
1925			/////////
1926			// File: utils/threadsafe_stack.h
1927			/////////
1928
1929			// This file is part of UFAL C++ Utils .
1930			//
1931			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
1932			// Mathematics and Physics, Charles University in Prague, Czech Republic.
1933			//
1934			// This Source Code Form is subject to the terms of the Mozilla Public
1935			// License, v. 2.0. If a copy of the MPL was not distributed with this
1936			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
1937
1938			namespace utils {
1939
1940			//
1941			// Declarations
1942			//
1943
1944			template
1945	3		class threadsafe_stack {
1946			public:
1947			inline void push(T* t);
1948			inline T* pop();
1949
1950			private:
1951			vector> stack;
1952			atomic_flag lock = ATOMIC_FLAG_INIT;
1953			};
1954
1955			//
1956			// Definitions
1957			//
1958
1959			template
1960	8		void threadsafe_stack::push(T* t) {
1961	4	0	while (lock.test_and_set(memory_order_acquire)) {}
		0
		50
		50
		50
		50
1962	4		stack.emplace_back(t);
1963			lock.clear(memory_order_release);
1964	4		}
1965
1966			template
1967	8		T* threadsafe_stack::pop() {
1968			T* res = nullptr;
1969
1970	4	0	while (lock.test_and_set(memory_order_acquire)) {}
		0
		50
		50
		50
		50
1971	4	0	if (!stack.empty()) {
		0
		50
		50
		50
		50
1972			res = stack.back().release();
1973			stack.pop_back();
1974			}
1975			lock.clear(memory_order_release);
1976
1977	4		return res;
1978			}
1979
1980			} // namespace utils
1981
1982			/////////
1983			// File: model/model_morphodita_parsito.h
1984			/////////
1985
1986			// This file is part of UDPipe .
1987			//
1988			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
1989			// Mathematics and Physics, Charles University in Prague, Czech Republic.
1990			//
1991			// This Source Code Form is subject to the terms of the Mozilla Public
1992			// License, v. 2.0. If a copy of the MPL was not distributed with this
1993			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
1994
1995	4		class model_morphodita_parsito : public model {
1996			public:
1997			virtual input_format* new_tokenizer(const string& options) const override;
1998			virtual bool tag(sentence& s, const string& options, string& error) const override;
1999			virtual bool parse(sentence& s, const string& options, string& error) const override;
2000
2001			static model* load(istream& is);
2002
2003			private:
2004			model_morphodita_parsito(unsigned version);
2005			unsigned version;
2006			enum { VERSION_LATEST = 3 };
2007
2008			unique_ptr tokenizer_factory;
2009			unique_ptr splitter;
2010	1		struct tagger_model {
2011			bool raw; bool upostag; int lemma; bool xpostag, feats;
2012			unique_ptr tagger;
2013
2014			tagger_model(bool raw, bool upostag, int lemma, bool xpostag, bool feats, morphodita::tagger* tagger)
2015	1		: raw(raw), upostag(upostag), lemma(lemma), xpostag(xpostag), feats(feats), tagger(tagger) {}
2016			};
2017			vector taggers;
2018			unique_ptr parser;
2019
2020	3		struct tagger_cache {
2021			vector forms_normalized;
2022			vector forms_string_pieces;
2023			vector lemmas;
2024			};
2025			mutable threadsafe_stack tagger_caches;
2026
2027	1	50	struct parser_cache {
2028			parsito::tree tree;
2029			named_values::map options;
2030			};
2031			mutable threadsafe_stack parser_caches;
2032
2033			bool parse(sentence& s, const string& options, string& error, double* cost) const;
2034
2035	0		class joint_with_parsing_tokenizer : public input_format {
2036			public:
2037			joint_with_parsing_tokenizer(input_format* tokenizer, const model_morphodita_parsito& model,
2038			int max_sentence_len, double change_boundary_logprob, double sentence_logprob)
2039			: tokenizer(tokenizer), model(model), max_sentence_len(max_sentence_len),
2040	0		change_boundary_logprob(change_boundary_logprob), sentence_logprob(sentence_logprob) {}
2041
2042			virtual bool read_block(istream& is, string& block) const override;
2043			virtual void reset_document(string_piece id) override;
2044			virtual void set_text(string_piece text, bool make_copy = false) override;
2045			virtual bool next_sentence(sentence& s, string& error) override;
2046
2047			private:
2048			bool parse_paragraph(vector& paragraph, string& error);
2049
2050			unique_ptr tokenizer;
2051			const model_morphodita_parsito& model;
2052			int max_sentence_len;
2053			double change_boundary_logprob;
2054			double sentence_logprob;
2055
2056			string_piece text;
2057			string text_copy;
2058			bool new_document = true;
2059			string document_id;
2060			unsigned sentence_id = 1;
2061			vector sentences;
2062			size_t sentences_index = 0;
2063			};
2064
2065			void fill_word_analysis(const morphodita::tagged_lemma& analysis, bool raw, bool upostag, int lemma, bool xpostag, bool feats, word& word) const;
2066			const string& normalize_form(string_piece form, string& output) const;
2067			const string& normalize_lemma(string_piece lemma, string& output) const;
2068			friend class trainer_morphodita_parsito;
2069			};
2070
2071			/////////
2072			// File: model/model.cpp
2073			/////////
2074
2075			// This file is part of UDPipe .
2076			//
2077			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
2078			// Mathematics and Physics, Charles University in Prague, Czech Republic.
2079			//
2080			// This Source Code Form is subject to the terms of the Mozilla Public
2081			// License, v. 2.0. If a copy of the MPL was not distributed with this
2082			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
2083
2084	2		const string model::DEFAULT;
2085	2		const string model::TOKENIZER_NORMALIZED_SPACES = "normalized_spaces";
2086	2		const string model::TOKENIZER_PRESEGMENTED = "presegmented";
2087	2		const string model::TOKENIZER_RANGES = "ranges";
2088
2089	1		model* model::load(const char* fname) {
2090	2	50	ifstream in(path_from_utf8(fname).c_str(), ifstream::in \| ifstream::binary);
2091	1	50	if (!in.is_open()) return nullptr;
2092	1	50	return load(in);
2093			}
2094
2095	1		model* model::load(istream& is) {
2096			char len;
2097	1	50	if (!is.get(len)) return nullptr;
2098	1		string name(len, ' ');
2099	1	50	if (!is.read(&name[0], len)) return nullptr;
		50
2100
2101	1	50	if (name == "morphodita_parsito") return model_morphodita_parsito::load(is);
		50
2102
2103			return nullptr;
2104			}
2105
2106			/////////
2107			// File: morphodita/tagger/tagger_ids.h
2108			/////////
2109
2110			// This file is part of MorphoDiTa .
2111			//
2112			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
2113			// Mathematics and Physics, Charles University in Prague, Czech Republic.
2114			//
2115			// This Source Code Form is subject to the terms of the Mozilla Public
2116			// License, v. 2.0. If a copy of the MPL was not distributed with this
2117			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
2118
2119			namespace morphodita {
2120
2121			class tagger_ids {
2122			public:
2123			enum tagger_id {
2124			CZECH2 = 0, CZECH3 = 1, CZECH2_3 = 6,
2125			/* 2 was used internally for ENGLISH3, but never released publicly */
2126			GENERIC2 = 3, GENERIC3 = 4, GENERIC4 = 5, GENERIC2_3 = 7,
2127			CONLLU2 = 8, CONLLU2_3 = 9, CONLLU3 = 10,
2128			};
2129
2130			static bool parse(const string& str, tagger_id& id) {
2131			if (str == "czech2") return id = CZECH2, true;
2132			if (str == "czech2_3") return id = CZECH2_3, true;
2133			if (str == "czech3") return id = CZECH3, true;
2134			if (str == "generic2") return id = GENERIC2, true;
2135			if (str == "generic2_3") return id = GENERIC2_3, true;
2136			if (str == "generic3") return id = GENERIC3, true;
2137			if (str == "generic4") return id = GENERIC4, true;
2138			if (str == "conllu2") return id = CONLLU2, true;
2139			if (str == "conllu2_3") return id = CONLLU2_3, true;
2140			if (str == "conllu3") return id = CONLLU3, true;
2141			return false;
2142			}
2143
2144			static int decoding_order(tagger_id id) {
2145			switch (id) {
2146			case CZECH2: return 2;
2147			case CZECH2_3: return 2;
2148			case CZECH3: return 3;
2149			case GENERIC2: return 2;
2150			case GENERIC2_3: return 2;
2151			case GENERIC3: return 3;
2152			case GENERIC4: return 4;
2153			case CONLLU2: return 2;
2154			case CONLLU2_3: return 2;
2155			case CONLLU3: return 3;
2156			}
2157			return 0;
2158			}
2159
2160			static int window_size(tagger_id id) {
2161			switch (id) {
2162			case CZECH2_3: return 3;
2163			case GENERIC2_3: return 3;
2164			case CONLLU2_3: return 3;
2165			default: break;
2166			}
2167			return decoding_order(id);
2168			}
2169			};
2170
2171			typedef tagger_ids::tagger_id tagger_id;
2172
2173			} // namespace morphodita
2174
2175			/////////
2176			// File: tokenizer/morphodita_tokenizer_wrapper.h
2177			/////////
2178
2179			// This file is part of UDPipe .
2180			//
2181			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
2182			// Mathematics and Physics, Charles University in Prague, Czech Republic.
2183			//
2184			// This Source Code Form is subject to the terms of the Mozilla Public
2185			// License, v. 2.0. If a copy of the MPL was not distributed with this
2186			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
2187
2188	4		class morphodita_tokenizer_wrapper : public input_format {
2189			public:
2190			morphodita_tokenizer_wrapper(morphodita::tokenizer* tokenizer, const multiword_splitter* splitter, bool normalized_spaces, bool token_ranges);
2191
2192			virtual bool read_block(istream& is, string& block) const override;
2193			virtual void reset_document(string_piece id) override;
2194			virtual void set_text(string_piece text, bool make_copy = false) override;
2195			virtual bool next_sentence(sentence& s, string& error) override;
2196
2197			private:
2198			unique_ptr tokenizer;
2199			const multiword_splitter* splitter;
2200			bool normalized_spaces, token_ranges;
2201
2202			bool new_document = true;
2203			string document_id;
2204			unsigned preceeding_newlines = 2;
2205			unsigned sentence_id = 1;
2206
2207			string_piece text;
2208			string text_copy;
2209			size_t unicode_offset = 0, text_unicode_length = 0;
2210			string saved_spaces;
2211			vector forms;
2212			vector tokens;
2213			token tok;
2214			};
2215
2216			/////////
2217			// File: utils/getpara.h
2218			/////////
2219
2220			// This file is part of UFAL C++ Utils .
2221			//
2222			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
2223			// Mathematics and Physics, Charles University in Prague, Czech Republic.
2224			//
2225			// This Source Code Form is subject to the terms of the Mozilla Public
2226			// License, v. 2.0. If a copy of the MPL was not distributed with this
2227			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
2228
2229			namespace utils {
2230
2231			//
2232			// Declarations
2233			//
2234
2235			// Read paragraph until EOF or end line. All encountered \n are stored.
2236			inline istream& getpara(istream& is, string& para);
2237
2238			//
2239			// Definitions
2240			//
2241
2242	0		istream& getpara(istream& is, string& para) {
2243			para.clear();
2244
2245	0	0	for (string line; getline(is, line); ) {
		0
2246			para.append(line);
2247	0	0	para.push_back('\n');
2248
2249	0	0	if (line.empty()) break;
2250			}
2251
2252	0	0	if (is.eof() && !para.empty()) is.clear(istream::eofbit);
		0
		0
2253	0		return is;
2254			}
2255
2256			} // namespace utils
2257
2258			/////////
2259			// File: utils/parse_double.h
2260			/////////
2261
2262			// This file is part of UFAL C++ Utils .
2263			//
2264			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
2265			// Mathematics and Physics, Charles University in Prague, Czech Republic.
2266			//
2267			// This Source Code Form is subject to the terms of the Mozilla Public
2268			// License, v. 2.0. If a copy of the MPL was not distributed with this
2269			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
2270
2271			namespace utils {
2272
2273			//
2274			// Declarations
2275			//
2276
2277			// Try to parse an double from given string. If the double cannot be parsed or does
2278			// not fit doubleo double, false is returned and the error string is filled using the
2279			// value_name argument.
2280			inline bool parse_double(string_piece str, const char* value_name, double& value, string& error);
2281
2282			// Try to parse an double from given string. If the double cannot be parsed or does
2283			// not fit doubleo double, an error is displayed and program exits.
2284			inline double parse_double(string_piece str, const char* value_name);
2285
2286			//
2287			// Definitions
2288			//
2289
2290	0		bool parse_double(string_piece str, const char* value_name, double& value, string& error) {
2291			string_piece original = str;
2292
2293			// Skip spaces
2294	0	0	while (str.len && (str.str[0] == ' ' \|\| str.str[0] == '\f' \|\| str.str[0] == '\n' \|\| str.str[0] == '\r' \|\| str.str[0] == '\t' \|\| str.str[0] == '\v'))
		0
		0
		0
		0
		0
2295	0		str.str++, str.len--;
2296
2297			// Allow plus/minus
2298			bool negative = false;
2299	0	0	if (str.len && (str.str[0] == '+' \|\| str.str[0] == '-')) {
		0
2300			negative = str.str[0] == '-';
2301	0		str.str++, str.len--;
2302			}
2303
2304			// Parse value, checking for overflow/underflow
2305	0	0	if (!str.len) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': empty string."), false;
2306			if (!(str.str[0] >= '0' \|\| str.str[0] <= '9')) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': non-digit character found."), false;
2307
2308	0		value = 0;
2309	0	0	while (str.len && str.str[0] >= '0' && str.str[0] <= '9') {
		0
		0
2310	0		value = 10 * value + (str.str[0] - '0');
2311	0		str.str++, str.len--;
2312			}
2313
2314			// If there is a decimal point, parse the rest of the
2315	0	0	if (str.len && str.str[0] == '.') {
		0
2316			double divider = 1;
2317
2318	0		str.str++, str.len--;
2319	0	0	while (str.len && str.str[0] >= '0' && str.str[0] <= '9') {
		0
		0
2320	0		value = 10 * value + (str.str[0] - '0');
2321	0		divider *= 10.;
2322	0		str.str++, str.len--;
2323			}
2324
2325	0		value /= divider;
2326			}
2327	0	0	if (!isfinite(value)) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': overflow occured."), false;
2328
2329			// Optionally parse an exponent
2330	0	0	if (str.len && (str.str[0] == 'e' \|\| str.str[0] == 'E')) {
		0
2331	0		str.str++, str.len--;
2332
2333			double exponent = 0;
2334			bool exponent_negative = false;
2335	0	0	if (str.len && (str.str[0] == '+' \|\| str.str[0] == '-')) {
		0
2336			exponent_negative = str.str[0] == '-';
2337	0		str.str++, str.len--;
2338			}
2339
2340	0	0	while (str.len && str.str[0] >= '0' && str.str[0] <= '9') {
		0
		0
2341	0		exponent = 10 * exponent + (str.str[0] - '0');
2342	0		str.str++, str.len--;
2343			}
2344
2345	0	0	exponent = pow(10., exponent_negative ? -exponent : exponent);
2346	0	0	if (!isfinite(exponent)) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': exponent overflow occured."), false;
2347	0	0	if (exponent == 0) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': exponent underflow occured."), false;
2348
2349	0	0	if (value) {
2350	0		value *= exponent;
2351	0	0	if (!isfinite(value)) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': overflow occured."), false;
2352	0	0	if (value == 0) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': underflow occured."), false;
2353			}
2354			}
2355
2356			// Apply initial minus
2357	0	0	if (negative) value *= -1;
2358
2359			// Skip spaces
2360	0	0	while (str.len && (str.str[0] == ' ' \|\| str.str[0] == '\f' \|\| str.str[0] == '\n' \|\| str.str[0] == '\r' \|\| str.str[0] == '\t' \|\| str.str[0] == '\v'))
		0
		0
		0
		0
		0
2361	0		str.str++, str.len--;
2362
2363			// Check for remaining characters
2364	0	0	if (str.len) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': non-digit character found."), false;
2365
2366			return true;
2367			}
2368
2369	0		double parse_double(string_piece str, const char* value_name) {
2370			double result;
2371			string error;
2372
2373	0	0	if (!parse_double(str, value_name, result, error))
		0
2374	0		runtime_failure(error);
2375
2376	0		return result;
2377			}
2378
2379			} // namespace utils
2380
2381			/////////
2382			// File: model/model_morphodita_parsito.cpp
2383			/////////
2384
2385			// This file is part of UDPipe .
2386			//
2387			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
2388			// Mathematics and Physics, Charles University in Prague, Czech Republic.
2389			//
2390			// This Source Code Form is subject to the terms of the Mozilla Public
2391			// License, v. 2.0. If a copy of the MPL was not distributed with this
2392			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
2393
2394			// Versions:
2395			// 1 - initial version
2396			// 2 - add absolute lemmas (tagger_model::lemma == 2)
2397			// - use Arabic and space normalization
2398
2399	1		input_format* model_morphodita_parsito::new_tokenizer(const string& options) const {
2400	1	50	if (!tokenizer_factory)
2401			return nullptr;
2402
2403			named_values::map parsed_options;
2404			string parse_error;
2405	1	50	if (!named_values::parse(options, parsed_options, parse_error))
		50
2406			return nullptr;
2407
2408	1	50	bool normalized_spaces = parsed_options.count("normalized_spaces");
2409	1	50	bool token_ranges = parsed_options.count("ranges");
2410
2411	1	50	const auto* morpho = !taggers.empty() ? taggers[0].tagger->get_morpho() : nullptr;
		50
2412	1	50	unique_ptr result(new morphodita_tokenizer_wrapper(tokenizer_factory->new_tokenizer(morpho), splitter.get(), normalized_spaces, token_ranges));
		50
		50
2413
2414			// Presegmented
2415	3	0	if (parsed_options.count("presegmented") && result)
		50
2416	0	0	result.reset(input_format::new_presegmented_tokenizer(result.release()));
2417
2418			// Joint with parsing
2419	3	0	if (parsed_options.count("joint_with_parsing") && result) {
		50
2420	0		int max_sentence_len = 20;
2421	0	0	if (parsed_options.count("joint_max_sentence_len") && !parse_int(parsed_options["joint_max_sentence_len"], "joint max sentence len", max_sentence_len, parse_error))
		0
		0
		0
		0
		0
		0
2422	0		return nullptr;
2423
2424	0		double change_boundary_logprob = -0.5;
2425	0	0	if (parsed_options.count("joint_change_boundary_logprob") && !parse_double(parsed_options["joint_change_boundary_logprob"], "joint change boundary logprob", change_boundary_logprob, parse_error))
		0
		0
		0
		0
		0
		0
2426			return nullptr;
2427
2428	0		double sentence_logprob = -0.5;
2429	0	0	if (parsed_options.count("joint_sentence_logprob") && !parse_double(parsed_options["joint_sentence_logprob"], "joint sentence logprob", sentence_logprob, parse_error))
		0
		0
		0
		0
		0
		0
2430			return nullptr;
2431
2432	0	0	result.reset(new joint_with_parsing_tokenizer(result.release(), *this, max_sentence_len, change_boundary_logprob, sentence_logprob));
2433			}
2434
2435	1		return result.release();
2436			}
2437
2438	1		bool model_morphodita_parsito::tag(sentence& s, const string& /options/, string& error) const {
2439			error.clear();
2440
2441	1	50	if (taggers.empty()) return error.assign("No tagger defined for the UDPipe model!"), false;
2442	1	50	if (s.empty()) return true;
2443
2444	1		tagger_cache* c = tagger_caches.pop();
2445	1	50	if (!c) c = new tagger_cache();
2446
2447			// Prepare input forms
2448	1		c->forms_normalized.resize(s.words.size() - 1);
2449	1		c->forms_string_pieces.resize(s.words.size() - 1);
2450	8	100	for (size_t i = 1; i < s.words.size(); i++)
2451	7		c->forms_string_pieces[i - 1] = normalize_form(s.words[i].form, c->forms_normalized[i - 1]);
2452
2453			// Clear first
2454	8	100	for (size_t i = 1; i < s.words.size(); i++) {
2455	7		s.words[i].lemma.assign("_");
2456			s.words[i].upostag.clear();
2457			s.words[i].xpostag.clear();
2458			s.words[i].feats.clear();
2459			}
2460
2461			// Fill information from the tagger models
2462	2	100	for (auto&& tagger : taggers) {
2463	1	50	if (!tagger.tagger) return error.assign("No tagger defined for the UDPipe model!"), false;
2464
2465	1		tagger.tagger->tag(c->forms_string_pieces, c->lemmas);
2466
2467	8	100	for (size_t i = 0; i < c->lemmas.size(); i++)
2468	7		fill_word_analysis(c->lemmas[i], tagger.raw, tagger.upostag, tagger.lemma, tagger.xpostag, tagger.feats, s.words[i+1]);
2469			}
2470
2471			// For raw tagger models, fill MorphoGuesser=Yes where appropriate
2472	1	50	if (taggers.size() == 1 && taggers[0].raw && taggers[0].tagger->get_morpho()) {
		50
		0
		50
2473	0		const auto* morpho = taggers[0].tagger->get_morpho();
2474	0	0	for (size_t i = 0; i < c->forms_string_pieces.size(); i++) {
2475	0	0	if (morpho->analyze(c->forms_string_pieces[i], morphodita::morpho::GUESSER, c->lemmas) == morphodita::morpho::GUESSER)
2476	0	0	s.words[i + 1].misc.append(s.words[i + 1].misc.empty() ? "" : "\|").append("MorphoGuesser=Yes");
2477			}
2478			}
2479
2480	1		tagger_caches.push(c);
2481	1		return true;
2482			}
2483
2484	1		bool model_morphodita_parsito::parse(sentence& s, const string& options, string& error) const {
2485	1		return parse(s, options, error, nullptr);
2486			}
2487
2488	1		bool model_morphodita_parsito::parse(sentence& s, const string& options, string& error, double* cost) const {
2489			error.clear();
2490
2491	1	50	if (!parser) return error.assign("No parser defined for the UDPipe model!"), false;
2492	1	50	if (s.empty()) return true;
2493
2494	1		parser_cache* c = parser_caches.pop();
2495	1	50	if (!c) c = new parser_cache();
2496
2497	1		int beam_search = 5;
2498	1	50	if (!named_values::parse(options, c->options, error))
2499			return false;
2500	2	50	if (c->options.count("beam_search"))
2501	0	0	if (!parse_int(c->options["beam_search"], "beam_search", beam_search, error))
		0
2502			return false;
2503
2504	1		c->tree.clear();
2505	8	100	for (size_t i = 1; i < s.words.size(); i++) {
2506	7		c->tree.add_node(string());
2507	7		normalize_form(s.words[i].form, c->tree.nodes.back().form);
2508	7		normalize_lemma(s.words[i].lemma, c->tree.nodes.back().lemma);
2509	14		c->tree.nodes.back().upostag.assign(s.words[i].upostag);
2510	14		c->tree.nodes.back().xpostag.assign(s.words[i].xpostag);
2511	14		c->tree.nodes.back().feats.assign(s.words[i].feats);
2512	14		c->tree.nodes.back().deps.assign(s.words[i].deps);
2513	14		c->tree.nodes.back().misc.assign(s.words[i].misc);
2514			}
2515
2516	1		parser->parse(c->tree, beam_search, cost);
2517	8	100	for (size_t i = 1; i < s.words.size(); i++)
2518	7		s.set_head(i, c->tree.nodes[i].head, c->tree.nodes[i].deprel);
2519
2520	1		parser_caches.push(c);
2521			return true;
2522			}
2523
2524	1		model* model_morphodita_parsito::load(istream& is) {
2525			char version;
2526	1	50	if (!is.get(version)) return nullptr;
2527	1	50	if (!(version >= 1 && version <= VERSION_LATEST)) return nullptr;
2528
2529			// Because UDPipe 1.0 does not check the model version,
2530			// a specific sentinel was added since version 2 so that
2531			// loading of such model fail on UDPipe 1.0
2532	1	50	if (version >= 2) {
2533			char sentinel;
2534	0	0	if (!is.get(sentinel) \|\| sentinel != 0x7F) return nullptr;
		0
		0
2535	0	0	if (!is.get(sentinel) \|\| sentinel != 0x7F) return nullptr;
		0
		0
2536			}
2537
2538	1		unique_ptr m(new model_morphodita_parsito((unsigned char)version));
2539	1	50	if (!m) return nullptr;
2540
2541			char tokenizer;
2542	1	50	if (!is.get(tokenizer)) return nullptr;
		50
2543	1	50	m->tokenizer_factory.reset(tokenizer ? morphodita::tokenizer_factory::load(is) : nullptr);
		50
2544	1	50	if (tokenizer && !m->tokenizer_factory) return nullptr;
		50
		50
2545	1	50	m->splitter.reset(tokenizer ? multiword_splitter::load(is) : nullptr);
		50
2546	1	50	if (tokenizer && !m->splitter) return nullptr;
		50
		50
2547
2548	1		m->taggers.clear();
2549	1	50	char taggers; if (!is.get(taggers)) return nullptr;
		50
2550	2	100	for (char i = 0; i < taggers; i++) {
2551	1	50	char lemma; if (!is.get(lemma)) return nullptr;
		50
2552	1	50	char xpostag; if (!is.get(xpostag)) return nullptr;
		50
2553	1	50	char feats; if (!is.get(feats)) return nullptr;
		50
2554	1	50	int model_type = is.peek();
2555	1		bool raw = !(model_type == morphodita::tagger_ids::CONLLU2 \|\|
2556			model_type == morphodita::tagger_ids::CONLLU2_3 \|\|
2557	1	50	model_type == morphodita::tagger_ids::CONLLU3);
		50
2558	1	50	morphodita::tagger* tagger = morphodita::tagger::load(is);
2559	1	50	if (!tagger) return nullptr;
2560	1	50	m->taggers.emplace_back(raw, i == 0, int(lemma), bool(xpostag), bool(feats), tagger);
2561			}
2562
2563			char parser;
2564	1	50	if (!is.get(parser)) return nullptr;
		50
2565	1	50	m->parser.reset(parser ? parsito::parser::load(is) : nullptr);
		50
2566	1	50	if (parser && !m->parser) return nullptr;
		50
		50
2567
2568	1		return m.release();
2569			}
2570
2571	0		model_morphodita_parsito::model_morphodita_parsito(unsigned version) : version(version) {}
2572
2573	0		bool model_morphodita_parsito::joint_with_parsing_tokenizer::read_block(istream& is, string& block) const {
2574			block.clear();
2575
2576	0	0	for (string line; getline(is, line); ) {
		0
2577			block.append(line);
2578	0	0	block.push_back('\n');
2579			}
2580
2581	0	0	if (is.eof() && !block.empty()) is.clear(istream::eofbit);
		0
		0
2582	0		return bool(is);
2583			}
2584
2585	0		void model_morphodita_parsito::joint_with_parsing_tokenizer::reset_document(string_piece id) {
2586	0		new_document = true;
2587	0		document_id.assign(id.str, id.len);
2588	0		sentence_id = 1;
2589	0		set_text("");
2590			sentences.clear();
2591	0		sentences_index = 0;
2592	0		}
2593
2594	0		void model_morphodita_parsito::joint_with_parsing_tokenizer::set_text(string_piece text, bool make_copy) {
2595	0	0	if (make_copy) {
2596	0		text_copy.assign(text.str, text.len);
2597			text.str = text_copy.c_str();
2598			}
2599	0		this->text = text;
2600	0		}
2601
2602	0		bool model_morphodita_parsito::joint_with_parsing_tokenizer::next_sentence(sentence& s, string& error) {
2603			error.clear();
2604
2605	0	0	if (text.len) {
2606			sentences.clear();
2607	0		sentences_index = 0;
2608
2609	0		tokenizer->set_text(text, false);
2610
2611	0		sentence input;
2612	0		vector paragraph;
2613	0	0	while (tokenizer->next_sentence(input, error)) {
		0
2614	0	0	if (input.get_new_par() && !paragraph.empty()) {
		0
		0
		0
2615	0	0	if (!parse_paragraph(paragraph, error)) return false;
		0
2616	0	0	for (auto&& sentence : paragraph)
2617	0	0	sentences.push_back(sentence);
2618			paragraph.clear();
2619			}
2620	0	0	paragraph.push_back(input);
2621			}
2622	0	0	if (!error.empty()) return false;
2623
2624	0	0	if (!paragraph.empty()) {
2625	0	0	if (!parse_paragraph(paragraph, error)) return false;
		0
2626	0	0	for (auto&& sentence : paragraph)
2627	0	0	sentences.push_back(sentence);
2628			}
2629
2630	0		text.len = 0;
2631			}
2632
2633	0	0	if (sentences_index < sentences.size()) {
2634	0		s = sentences[sentences_index++];
2635	0		return true;
2636			}
2637
2638			return false;
2639			}
2640
2641	0		bool model_morphodita_parsito::joint_with_parsing_tokenizer::parse_paragraph(vector& paragraph, string& error) {
2642	0		sentence all_words;
2643	0	0	vector sentence_boundary(1, true);
2644	0	0	vector token_boundary(1, true);
2645
2646	0	0	for (auto&& s : paragraph) {
2647	0		unsigned offset = all_words.words.size() - 1;
2648	0	0	for (unsigned i = 1; i < s.words.size(); i++) {
2649	0	0	all_words.words.push_back(s.words[i]);
2650	0		all_words.words.back().id += offset;
2651	0	0	sentence_boundary.push_back(i+1 == s.words.size());
2652	0	0	token_boundary.push_back(true);
2653			}
2654
2655	0	0	for (auto&& mwt : s.multiword_tokens) {
2656	0	0	all_words.multiword_tokens.push_back(mwt);
2657	0		all_words.multiword_tokens.back().id_first += offset;
2658	0		all_words.multiword_tokens.back().id_last += offset;
2659	0	0	for (int i = all_words.multiword_tokens.back().id_first; i < all_words.multiword_tokens.back().id_last; i++)
2660	0		token_boundary[i] = false;
2661			}
2662			}
2663
2664	0	0	vector best_logprob(all_words.words.size(), -numeric_limits::infinity()); best_logprob[0] = 0.;
2665	0	0	vector best_length(all_words.words.size(), 0);
2666	0	0	sentence s;
2667
2668	0	0	for (unsigned start = 1; start < all_words.words.size(); start++) {
2669	0	0	if (!token_boundary[start - 1]) continue;
2670	0	0	s.clear();
2671	0	0	for (unsigned end = start + 1; end <= all_words.words.size() && (end - start) <= unsigned(max_sentence_len); end++) {
		0
		0
2672	0	0	s.words.push_back(all_words.words[end - 1]);
2673	0		s.words.back().id -= start - 1;
2674	0	0	if (!token_boundary[end - 1]) continue;
2675
2676	0	0	for (unsigned i = 1; i < s.words.size(); i++) {
2677	0		s.words[i].head = -1;
2678			s.words[i].children.clear();
2679			}
2680
2681			double cost;
2682	0	0	if (!model.parse(s, DEFAULT, error, &cost)) return false;
		0
2683	0		cost += sentence_logprob + change_boundary_logprob * (2 - int(sentence_boundary[start - 1]) - int(sentence_boundary[end - 1]));
2684	0	0	if (best_logprob[start - 1] + cost > best_logprob[end - 1]) {
2685	0		best_logprob[end - 1] = best_logprob[start - 1] + cost;
2686	0		best_length[end - 1] = end - start;
2687			}
2688			}
2689			}
2690
2691			vector sentence_lengths;
2692	0	0	for (unsigned end = all_words.words.size(); end > 1; end -= best_length[end - 1])
2693	0	0	sentence_lengths.push_back(best_length[end - 1]);
2694
2695			paragraph.clear();
2696
2697	0		sentence_lengths.push_back(1);
2698			reverse(sentence_lengths.begin(), sentence_lengths.end());
2699	0	0	for (unsigned i = 1; i < sentence_lengths.size(); i++) {
2700	0		sentence_lengths[i] += sentence_lengths[i - 1];
2701
2702	0	0	paragraph.emplace_back();
2703	0	0	while (!all_words.multiword_tokens.empty() && unsigned(all_words.multiword_tokens.front().id_first) < sentence_lengths[i]) {
		0
		0
2704	0	0	paragraph.back().multiword_tokens.push_back(all_words.multiword_tokens.front());
2705	0		paragraph.back().multiword_tokens.back().id_first -= sentence_lengths[i-1] - 1;
2706	0		paragraph.back().multiword_tokens.back().id_last -= sentence_lengths[i-1] - 1;
2707			all_words.multiword_tokens.erase(all_words.multiword_tokens.begin());
2708			}
2709
2710	0	0	for (unsigned word = sentence_lengths[i - 1]; word < sentence_lengths[i]; word++) {
2711	0	0	paragraph.back().words.push_back(all_words.words[word]);
2712	0		paragraph.back().words.back().id -= sentence_lengths[i-1] - 1;
2713	0		paragraph.back().words.back().head = -1;
2714			paragraph.back().words.back().children.clear();
2715			}
2716			}
2717
2718	0	0	if (!paragraph.empty()) {
2719	0	0	if (new_document) {
2720	0	0	paragraph.front().set_new_doc(true, document_id);
2721	0		new_document = false;
2722			}
2723
2724	0	0	paragraph.front().set_new_par(true);
2725			}
2726
2727			return true;
2728			}
2729
2730	7		void model_morphodita_parsito::fill_word_analysis(const morphodita::tagged_lemma& analysis, bool raw, bool upostag, int lemma, bool xpostag, bool feats, word& word) const {
2731			// Handle raw MorphoDiTa models.
2732	7	50	if (raw) {
2733	0	0	if (lemma) word.lemma.assign(analysis.lemma);
2734	0	0	if (xpostag) word.xpostag.assign(analysis.tag);
2735			return;
2736			}
2737
2738			// Lemma
2739	7	50	if (lemma == 1) {
2740	7		word.lemma.assign(analysis.lemma);
2741	0	0	} else if (lemma == 2) {
2742	0		word.lemma.assign(analysis.lemma);
2743
2744			// Lemma matching ~replacement~normalized_form is changed to replacement.
2745	0	0	if (analysis.lemma[0] == '~') {
2746	0		auto end = analysis.lemma.find('~', 1);
2747	0	0	if (end != string::npos) {
2748	0		normalize_form(word.form, word.lemma);
2749	0	0	if (analysis.lemma.compare(end + 1, string::npos, word.lemma) == 0)
2750	0		word.lemma.assign(analysis.lemma, 1, end - 1);
2751			else
2752			word.lemma.assign(analysis.lemma);
2753			}
2754			}
2755			}
2756	7	50	if (version == 2) {
2757			// Replace '\001' back to spaces
2758	0	0	for (auto && chr : word.lemma)
2759	0	0	if (chr == '\001')
2760	0		chr = ' ';
2761	7	50	} else if (version >= 3) {
2762			// Replace '0xC2 0xA0' back to spaces
2763	0	0	for (size_t i = 0; i + 1 < word.lemma.size(); i++)
2764	0	0	if (word.lemma[i] == char(0xC2) && word.lemma[i+1] == char(0xA0))
		0
		0
2765	0		word.lemma.replace(i, 2, 1, ' ');
2766			}
2767
2768	7	50	if (!upostag && !xpostag && !feats) return;
		0
2769
2770			// UPOSTag
2771	7		char separator = analysis.tag[0];
2772	7		size_t start = min(size_t(1), analysis.tag.size()), end = min(analysis.tag.find(separator, 1), analysis.tag.size());
2773	7	50	if (upostag) word.upostag.assign(analysis.tag, start, end - start);
2774
2775	7	50	if (!xpostag && !feats) return;
2776
2777			// XPOSTag
2778	14		start = min(end + 1, analysis.tag.size());
2779	7		end = min(analysis.tag.find(separator, start), analysis.tag.size());
2780	7	50	if (xpostag) word.xpostag.assign(analysis.tag, start, end - start);
2781
2782	7	50	if (!feats) return;
2783
2784			// Features
2785	14		start = min(end + 1, analysis.tag.size());
2786	7		word.feats.assign(analysis.tag, start, analysis.tag.size() - start);
2787			}
2788
2789	14		const string& model_morphodita_parsito::normalize_form(string_piece form, string& output) const {
2790			using unilib::utf8;
2791
2792			// No normalization on version 1
2793	28	50	if (version <= 1) return output.assign(form.str, form.len);
2794
2795			// If requested, replace space by \001 in version 2 and by (\u00a0) since version 3
2796
2797			// Arabic normalization since version 2, implementation resulted from
2798			// discussion with Otakar Smrz and Nasrin Taghizadeh.
2799			// 1. Remove https://codepoints.net/U+0640 without any reasonable doubt :)
2800			// 2. Remove https://codepoints.net/U+0652
2801			// 3. Remove https://codepoints.net/U+0670
2802			// 4. Remove everything from https://codepoints.net/U+0653 to
2803			// https://codepoints.net/U+0657 though they are probably very rare in date
2804			// 5. Remove everything from https://codepoints.net/U+064B to
2805			// https://codepoints.net/U+0650
2806			// 6. Remove https://codepoints.net/U+0651
2807			// 7. Replace https://codepoints.net/U+0671 with https://codepoints.net/U+0627
2808			// 8. Replace https://codepoints.net/U+0622 with https://codepoints.net/U+0627
2809			// 9. Replace https://codepoints.net/U+0623 with https://codepoints.net/U+0627
2810			// 10. Replace https://codepoints.net/U+0625 with https://codepoints.net/U+0627
2811			// 11. Replace https://codepoints.net/U+0624 with https://codepoints.net/U+0648
2812			// 12. Replace https://codepoints.net/U+0626 with https://codepoints.net/U+064A
2813			// One might also consider replacing some Farsi characters that might be typed
2814			// unintentionally (by Iranians writing Arabic language texts):
2815			// 13. Replace https://codepoints.net/U+06CC with https://codepoints.net/U+064A
2816			// 14. Replace https://codepoints.net/U+06A9 with https://codepoints.net/U+0643
2817			// 15. Replace https://codepoints.net/U+06AA with https://codepoints.net/U+0643
2818			//
2819			// Not implemented:
2820			// There is additional challenge with data coming from Egypt (such as printed
2821			// or online newspapers), where the word-final https://codepoints.net/U+064A
2822			// may be switched for https://codepoints.net/U+0649 and visa versa. Also, the
2823			// word-final https://codepoints.net/U+0647 could actually represent https://
2824			// codepoints.net/U+0629. You can experiment with the following replacements,
2825			// but I would rather apply them only after classifying the whole document as
2826			// following such convention:
2827			// 1. Replace https://codepoints.net/U+0629 with https://codepoints.net/U+0647
2828			// (frequent femine ending markers would appear like a third-person
2829			// masculine pronoun clitic instead)
2830			// 2. Replace https://codepoints.net/U+0649 with https://codepoints.net/U+064A
2831			// (some "weak" words would become even more ambiguous or appear as if
2832			// with a first-person pronoun clitic)
2833
2834			output.clear();
2835	0	0	for (auto&& chr : utf8::decoder(form.str, form.len)) {
2836			// Arabic normalization
2837	0	0	if (chr == 0x640 \|\| (chr >= 0x64B && chr <= 0x657) \|\| chr == 0x670) {}
		0
2838	0	0	else if (chr == 0x622) utf8::append(output, 0x627);
2839	0	0	else if (chr == 0x623) utf8::append(output, 0x627);
2840	0	0	else if (chr == 0x624) utf8::append(output, 0x648);
2841	0	0	else if (chr == 0x625) utf8::append(output, 0x627);
2842	0	0	else if (chr == 0x626) utf8::append(output, 0x64A);
2843	0	0	else if (chr == 0x671) utf8::append(output, 0x627);
2844	0	0	else if (chr == 0x6A9) utf8::append(output, 0x643);
2845	0	0	else if (chr == 0x6AA) utf8::append(output, 0x643);
2846	0	0	else if (chr == 0x6CC) utf8::append(output, 0x64A);
2847			// Space normalization
2848	0	0	else if (chr == ' ' && version == 2) utf8::append(output, 0x01);
		0
2849	0	0	else if (chr == ' ' && version >= 3) utf8::append(output, 0xA0);
		0
2850			// Default
2851	0		else utf8::append(output, chr);
2852			}
2853
2854			// Make sure we do not remove everything
2855	0	0	if (output.empty() && form.len)
		0
		0
2856	0		utf8::append(output, utf8::first(form.str, form.len));
2857
2858			return output;
2859			}
2860
2861	7		const string& model_morphodita_parsito::normalize_lemma(string_piece lemma, string& output) const {
2862			using unilib::utf8;
2863
2864			// No normalization on version 1 and 2
2865	14	50	if (version <= 2) return output.assign(lemma.str, lemma.len);
2866
2867			// Normalize spaces by since version 3
2868			output.clear();
2869	0	0	for (size_t i = 0; i < lemma.len; i++) {
2870			// Space normalization
2871	0	0	if (lemma.str[i] == ' ') utf8::append(output, 0xA0);
2872			// Default
2873	0		else output.push_back(lemma.str[i]);
2874			}
2875
2876			return output;
2877			}
2878
2879			/////////
2880			// File: model/pipeline.h
2881			/////////
2882
2883			// This file is part of UDPipe .
2884			//
2885			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
2886			// Mathematics and Physics, Charles University in Prague, Czech Republic.
2887			//
2888			// This Source Code Form is subject to the terms of the Mozilla Public
2889			// License, v. 2.0. If a copy of the MPL was not distributed with this
2890			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
2891
2892			class pipeline {
2893			public:
2894			pipeline(const model* m, const string& input, const string& tagger, const string& parser, const string& output);
2895
2896			void set_model(const model* m);
2897			void set_input(const string& input);
2898			void set_tagger(const string& tagger);
2899			void set_parser(const string& parser);
2900			void set_output(const string& output);
2901
2902			void set_immediate(bool immediate);
2903			void set_document_id(const string& document_id);
2904
2905			bool process(istream& is, ostream& os, string& error) const;
2906
2907			static const string DEFAULT;
2908			static const string NONE;
2909
2910			private:
2911			const model* m;
2912			string input, tokenizer, tagger, parser, output;
2913			string document_id;
2914			bool immediate;
2915			};
2916
2917			/////////
2918			// File: sentence/output_format.h
2919			/////////
2920
2921			// This file is part of UDPipe .
2922			//
2923			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
2924			// Mathematics and Physics, Charles University in Prague, Czech Republic.
2925			//
2926			// This Source Code Form is subject to the terms of the Mozilla Public
2927			// License, v. 2.0. If a copy of the MPL was not distributed with this
2928			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
2929
2930	1		class output_format {
2931			public:
2932	1		virtual ~output_format() {}
2933
2934			virtual void write_sentence(const sentence& s, ostream& os) = 0;
2935	0		virtual void finish_document(ostream& /os/) {}
2936
2937			// Static factory methods
2938			static output_format* new_output_format(const string& name);
2939			static output_format* new_conllu_output_format(const string& options = string());
2940			static output_format* new_epe_output_format(const string& options = string());
2941			static output_format* new_matxin_output_format(const string& options = string());
2942			static output_format* new_horizontal_output_format(const string& options = string());
2943			static output_format* new_plaintext_output_format(const string& options = string());
2944			static output_format* new_vertical_output_format(const string& options = string());
2945
2946			static const string CONLLU_V1;
2947			static const string CONLLU_V2;
2948			static const string HORIZONTAL_PARAGRAPHS;
2949			static const string PLAINTEXT_NORMALIZED_SPACES;
2950			static const string VERTICAL_PARAGRAPHS;
2951			};
2952
2953			/////////
2954			// File: utils/getwhole.h
2955			/////////
2956
2957			// This file is part of UFAL C++ Utils .
2958			//
2959			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
2960			// Mathematics and Physics, Charles University in Prague, Czech Republic.
2961			//
2962			// This Source Code Form is subject to the terms of the Mozilla Public
2963			// License, v. 2.0. If a copy of the MPL was not distributed with this
2964			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
2965
2966			namespace utils {
2967
2968			//
2969			// Declarations
2970			//
2971
2972			// Read whole content until EOF. All encountered \n are stored.
2973			inline istream& getwhole(istream& is, string& whole);
2974
2975			//
2976			// Definitions
2977			//
2978
2979	0		istream& getwhole(istream& is, string& whole) {
2980			whole.clear();
2981
2982	0	0	for (string line; getline(is, line); )
		0
2983	0	0	whole.append(line).push_back('\n');
2984
2985	0	0	if (is.eof() && !whole.empty()) is.clear(istream::eofbit);
		0
		0
2986	0		return is;
2987			}
2988
2989			} // namespace utils
2990
2991			/////////
2992			// File: model/pipeline.cpp
2993			/////////
2994
2995			// This file is part of UDPipe .
2996			//
2997			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
2998			// Mathematics and Physics, Charles University in Prague, Czech Republic.
2999			//
3000			// This Source Code Form is subject to the terms of the Mozilla Public
3001			// License, v. 2.0. If a copy of the MPL was not distributed with this
3002			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
3003
3004	2		const string pipeline::DEFAULT;
3005	2		const string pipeline::NONE = "none";
3006
3007	0		pipeline::pipeline(const model* m, const string& input, const string& tagger, const string& parser, const string& output) : immediate(false) {
3008			set_model(m);
3009	0	0	set_input(input);
3010			set_tagger(tagger);
3011			set_parser(parser);
3012	0	0	set_output(output);
3013	0		}
3014
3015	0		void pipeline::set_model(const model* m) {
3016	0		this->m = m;
3017	0		}
3018
3019	0		void pipeline::set_input(const string& input) {
3020			tokenizer.clear();
3021
3022	0	0	if (input.empty()) {
3023	0		this->input = "conllu";
3024	0	0	} else if (input == "tokenize" \|\| input == "tokenizer") {
3025	0		this->input = "tokenizer";
3026	0	0	} else if (input.compare(0, 10, "tokenizer=") == 0) {
3027	0		this->input = "tokenizer";
3028	0		tokenizer.assign(input, 10, string::npos);
3029			} else {
3030	0		this->input = input;
3031			}
3032	0		}
3033
3034	0		void pipeline::set_tagger(const string& tagger) {
3035	0		this->tagger = tagger;
3036	0		}
3037
3038	0		void pipeline::set_parser(const string& parser) {
3039	0		this->parser = parser;
3040	0		}
3041
3042	0		void pipeline::set_output(const string& output) {
3043	0	0	this->output = output.empty() ? "conllu" : output;
3044	0		}
3045
3046	0		void pipeline::set_immediate(bool immediate) {
3047	0		this->immediate = immediate;
3048	0		}
3049
3050	0		void pipeline::set_document_id(const string& document_id) {
3051	0		this->document_id = document_id;
3052	0		}
3053
3054	0		bool pipeline::process(istream& is, ostream& os, string& error) const {
3055			error.clear();
3056
3057	0		sentence s;
3058
3059			unique_ptr reader;
3060	0	0	if (input == "tokenizer") {
3061	0	0	reader.reset(m->new_tokenizer(tokenizer));
3062	0	0	if (!reader) return error.assign("The model does not have a tokenizer!"), false;
		0
3063			} else {
3064	0	0	reader.reset(input_format::new_input_format(input));
3065	0	0	if (!reader) return error.assign("The requested input format '").append(input).append("' does not exist!"), false;
		0
		0
3066			}
3067	0	0	reader->reset_document(document_id);
3068
3069	0	0	unique_ptr writer(output_format::new_output_format(output));
3070	0	0	if (!writer) return error.assign("The requested output format '").append(output).append("' does not exist!"), false;
		0
		0
3071
3072			string block;
3073	0	0	while (immediate ? reader->read_block(is, block) : bool(getwhole(is, block))) {
		0
		0
		0
3074	0	0	reader->set_text(block);
3075	0	0	while (reader->next_sentence(s, error)) {
		0
3076	0	0	if (tagger != NONE)
3077	0	0	if (!m->tag(s, tagger, error))
		0
3078			return false;
3079
3080	0	0	if (parser != NONE)
3081	0	0	if (!m->parse(s, parser, error))
		0
3082			return false;
3083
3084	0	0	writer->write_sentence(s, os);
3085			}
3086	0	0	if (!error.empty()) return false;
3087			}
3088	0	0	writer->finish_document(os);
3089
3090			return true;
3091			}
3092
3093			/////////
3094			// File: morphodita/tagset_converter/tagset_converter.h
3095			/////////
3096
3097			// This file is part of MorphoDiTa .
3098			//
3099			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
3100			// Mathematics and Physics, Charles University in Prague, Czech Republic.
3101			//
3102			// This Source Code Form is subject to the terms of the Mozilla Public
3103			// License, v. 2.0. If a copy of the MPL was not distributed with this
3104			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
3105
3106			namespace morphodita {
3107
3108	0		class tagset_converter {
3109			public:
3110	0		virtual ~tagset_converter() {}
3111
3112			// Convert a tag-lemma pair to a different tag set.
3113			virtual void convert(tagged_lemma& tagged_lemma) const = 0;
3114			// Convert a result of analysis to a different tag set. Apart from calling
3115			// convert, any repeated entry is removed.
3116			virtual void convert_analyzed(vector& tagged_lemmas) const = 0;
3117			// Convert a result of generation to a different tag set. Apart from calling
3118			// convert, any repeated entry is removed.
3119			virtual void convert_generated(vector& forms) const = 0;
3120
3121			// Static factory methods
3122			static tagset_converter* new_identity_converter();
3123
3124			static tagset_converter* new_pdt_to_conll2009_converter();
3125			static tagset_converter* new_strip_lemma_comment_converter(const morpho& dictionary);
3126			static tagset_converter* new_strip_lemma_id_converter(const morpho& dictionary);
3127			};
3128
3129			// Helper method for creating tagset_converter from instance name.
3130			tagset_converter* new_tagset_converter(const string& name, const morpho& dictionary);
3131
3132			// Helper methods making sure remapped results are unique.
3133			void tagset_converter_unique_analyzed(vector& tagged_lemmas);
3134			void tagset_converter_unique_generated(vector& forms);
3135
3136			} // namespace morphodita
3137
3138			/////////
3139			// File: morphodita/derivator/derivation_formatter.h
3140			/////////
3141
3142			// This file is part of MorphoDiTa .
3143			//
3144			// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
3145			// Mathematics and Physics, Charles University in Prague, Czech Republic.
3146			//
3147			// This Source Code Form is subject to the terms of the Mozilla Public
3148			// License, v. 2.0. If a copy of the MPL was not distributed with this
3149			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
3150
3151			namespace morphodita {
3152
3153	0		class derivation_formatter {
3154			public:
3155	0		virtual ~derivation_formatter() {}
3156
3157			// Perform the required derivation and store it directly in the lemma.
3158			virtual void format_derivation(string& lemma) const;
3159
3160			// Perform the required derivation and store it directly in the tagged_lemma.
3161			// If a tagset_converter is given, it is also applied.
3162			virtual void format_tagged_lemma(tagged_lemma& lemma, const tagset_converter* converter = nullptr) const = 0;
3163
3164			// Perform the required derivation on a list of tagged_lemmas.
3165			// If a tagset_converter is given, it is also applied.
3166			// Either way, only unique entries are returned.
3167			virtual void format_tagged_lemmas(vector& lemmas, const tagset_converter* converter = nullptr) const;
3168
3169			// Static factory methods.
3170			static derivation_formatter* new_none_derivation_formatter();
3171			static derivation_formatter* new_root_derivation_formatter(const derivator* derinet);
3172			static derivation_formatter* new_path_derivation_formatter(const derivator* derinet);
3173			static derivation_formatter* new_tree_derivation_formatter(const derivator* derinet);
3174			// String version of static factory method.
3175			static derivation_formatter* new_derivation_formatter(string_piece name, const derivator* derinet);
3176			};
3177
3178			} // namespace morphodita
3179
3180			/////////
3181			// File: morphodita/derivator/derivation_formatter.cpp
3182			/////////
3183
3184			// This file is part of MorphoDiTa .
3185			//
3186			// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
3187			// Mathematics and Physics, Charles University in Prague, Czech Republic.
3188			//
3189			// This Source Code Form is subject to the terms of the Mozilla Public
3190			// License, v. 2.0. If a copy of the MPL was not distributed with this
3191			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
3192
3193			namespace morphodita {
3194
3195	0		void derivation_formatter::format_derivation(string& lemma) const {
3196	0		tagged_lemma result;
3197	0		result.lemma.swap(lemma);
3198	0	0	format_tagged_lemma(result);
3199	0		lemma.swap(result.lemma);
3200	0		}
3201
3202	0		void derivation_formatter::format_tagged_lemmas(vector& lemmas, const tagset_converter* converter) const {
3203	0	0	for (auto&& lemma : lemmas)
3204	0		format_tagged_lemma(lemma, converter);
3205
3206	0	0	if (lemmas.size() > 1)
3207	0		tagset_converter_unique_analyzed(lemmas);
3208	0		}
3209
3210	0		class none_derivation_formatter : public derivation_formatter {
3211	0		virtual void format_derivation(string& /lemma/) const override {}
3212
3213	0		virtual void format_tagged_lemma(tagged_lemma& lemma, const tagset_converter* converter) const override {
3214	0	0	if (converter) converter->convert(lemma);
3215	0		}
3216
3217	0		virtual void format_tagged_lemmas(vector& lemmas, const tagset_converter* converter) const override {
3218	0	0	if (converter) converter->convert_analyzed(lemmas);
3219	0		}
3220			};
3221
3222	0		derivation_formatter* derivation_formatter::new_none_derivation_formatter() {
3223	0		return new none_derivation_formatter();
3224			}
3225
3226	0		class root_derivation_formatter : public derivation_formatter {
3227			public:
3228	0		root_derivation_formatter(const derivator* derinet) : derinet(derinet) {}
3229
3230	0		virtual void format_tagged_lemma(tagged_lemma& lemma, const tagset_converter* converter) const override {
3231	0	0	for (derivated_lemma parent; derinet->parent(lemma.lemma, parent); )
		0
3232	0		lemma.lemma.assign(parent.lemma);
3233	0	0	if (converter) converter->convert(lemma);
3234	0		}
3235
3236			private:
3237			const derivator* derinet;
3238			};
3239
3240	0		derivation_formatter* derivation_formatter::new_root_derivation_formatter(const derivator* derinet) {
3241	0	0	return derinet ? new root_derivation_formatter(derinet) : nullptr;
		0
3242			}
3243
3244	0		class path_derivation_formatter : public derivation_formatter {
3245			public:
3246	0		path_derivation_formatter(const derivator* derinet) : derinet(derinet) {}
3247
3248	0		virtual void format_tagged_lemma(tagged_lemma& lemma, const tagset_converter* converter) const override {
3249	0		tagged_lemma current(lemma);
3250	0	0	if (converter) converter->convert(lemma);
		0
3251	0	0	for (derivated_lemma parent; derinet->parent(current.lemma, parent); current.lemma.swap(parent.lemma)) {
		0
3252	0	0	tagged_lemma parrent_lemma(parent.lemma, current.tag);
3253	0	0	if (converter) converter->convert(parrent_lemma);
		0
3254	0	0	lemma.lemma.append(" ").append(parrent_lemma.lemma);
3255			}
3256	0		}
3257
3258			private:
3259			const derivator* derinet;
3260			};
3261
3262	0		derivation_formatter* derivation_formatter::new_path_derivation_formatter(const derivator* derinet) {
3263	0	0	return derinet ? new path_derivation_formatter(derinet) : nullptr;
		0
3264			}
3265
3266	0		class tree_derivation_formatter : public derivation_formatter {
3267			public:
3268	0		tree_derivation_formatter(const derivator* derinet) : derinet(derinet) {}
3269
3270	0		virtual void format_tagged_lemma(tagged_lemma& lemma, const tagset_converter* converter) const override {
3271			string root(lemma.lemma), tag(lemma.tag);
3272	0	0	if (converter) converter->convert(lemma);
		0
3273	0	0	for (derivated_lemma parent; derinet->parent(root, parent); root.swap(parent.lemma)) {}
		0
3274	0	0	format_tree(root, tag, lemma, converter);
3275	0		}
3276
3277	0		void format_tree(const string& root, const string& tag, tagged_lemma& tree, const tagset_converter* converter) const {
3278	0		vector children;
3279
3280	0	0	if (converter) {
3281	0	0	tagged_lemma current(root, tag);
3282	0	0	converter->convert(current);
3283	0	0	tree.lemma.append(" ").append(current.lemma);
3284			} else {
3285	0	0	tree.lemma.append(" ").append(root);
3286			}
3287
3288	0	0	if (derinet->children(root, children))
		0
3289	0	0	for (auto&& child : children)
3290	0	0	format_tree(child.lemma, tag, tree, converter);
3291	0	0	tree.lemma.push_back(' ');
3292	0		}
3293
3294			private:
3295			const derivator* derinet;
3296			};
3297
3298	0		derivation_formatter* derivation_formatter::new_tree_derivation_formatter(const derivator* derinet) {
3299	0	0	return derinet ? new tree_derivation_formatter(derinet) : nullptr;
		0
3300			}
3301
3302	0		derivation_formatter* derivation_formatter::new_derivation_formatter(string_piece name, const derivator* derinet) {
3303	0	0	if (name == "none") return new_none_derivation_formatter();
3304	0	0	if (name == "root") return new_root_derivation_formatter(derinet);
3305	0	0	if (name == "path") return new_path_derivation_formatter(derinet);
3306	0	0	if (name == "tree") return new_tree_derivation_formatter(derinet);
3307			return nullptr;
3308			}
3309
3310			} // namespace morphodita
3311
3312			/////////
3313			// File: morphodita/morpho/small_stringops.h
3314			/////////
3315
3316			// This file is part of MorphoDiTa .
3317			//
3318			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
3319			// Mathematics and Physics, Charles University in Prague, Czech Republic.
3320			//
3321			// This Source Code Form is subject to the terms of the Mozilla Public
3322			// License, v. 2.0. If a copy of the MPL was not distributed with this
3323			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
3324
3325			namespace morphodita {
3326
3327			// Declarations
3328			inline bool small_memeq(const void* a, const void* b, size_t len);
3329			inline void small_memcpy(void* dest, const void* src, size_t len);
3330
3331			// Definitions
// Returns true iff the first `len` bytes at `a_void` and `b_void` are equal.
// A zero length always compares equal. Written as a plain byte loop.
bool small_memeq(const void* a_void, const void* b_void, size_t len) {
  const char* a = (const char*)a_void;
  const char* b = (const char*)b_void;

  // Compare the pointed-to bytes, not the pointer values.
  while (len--)
    if (*a++ != *b++)
      return false;
  return true;
}
3341
// Copies `len` bytes from `src_void` to `dest_void`, front to back, one byte
// at a time. A zero length copies nothing.
void small_memcpy(void* dest_void, const void* src_void, size_t len) {
  char* dest = (char*)dest_void;
  const char* src = (const char*)src_void;

  // Copy the pointed-to bytes while advancing both cursors.
  while (len--)
    *dest++ = *src++;
}
3349
3350			} // namespace morphodita
3351
3352			/////////
3353			// File: trainer/training_failure.h
3354			/////////
3355
3356			// This file is part of UDPipe .
3357			//
3358			// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
3359			// Mathematics and Physics, Charles University in Prague, Czech Republic.
3360			//
3361			// This Source Code Form is subject to the terms of the Mozilla Public
3362			// License, v. 2.0. If a copy of the MPL was not distributed with this
3363			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
3364
3365			namespace utils {
3366
			// Exception type thrown by the training_failure macro below.
3367	0		class training_error : public runtime_error {
3368			public:
			// Defined elsewhere; NOTE(review): presumably builds the exception
			// text from message_collector -- confirm against the definition.
3369			training_error();
3370
			// Stream shared by all users of training_failure; the macro writes
			// the message here before constructing the exception.
3371			static ostringstream message_collector;
3372			};
3373
			// Streams `message` into training_error::message_collector and then
			// throws a training_error. The comma operator guarantees the message
			// is written before the exception object is constructed.
3374			#define training_failure(message) throw (training_error::message_collector << message, training_error())
3375
3376			} // namespace utils
3377
3378			/////////
3379			// File: utils/binary_encoder.h
3380			/////////
3381
3382			// This file is part of UFAL C++ Utils .
3383			//
3384			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
3385			// Mathematics and Physics, Charles University in Prague, Czech Republic.
3386			//
3387			// This Source Code Form is subject to the terms of the Mozilla Public
3388			// License, v. 2.0. If a copy of the MPL was not distributed with this
3389			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
3390
3391			namespace utils {
3392
3393			//
3394			// Declarations
3395			//
3396
3397	0		class binary_encoder {
3398			public:
3399			inline binary_encoder();
3400
3401			inline void add_1B(unsigned val);
3402			inline void add_2B(unsigned val);
3403			inline void add_4B(unsigned val);
3404			inline void add_float(double val);
3405			inline void add_double(double val);
3406			inline void add_str(string_piece str);
3407			inline void add_data(string_piece data);
3408			template inline void add_data(const vector& data);
3409			template inline void add_data(const T* data, size_t elements);
3410
3411			vector data;
3412			};
3413
3414			//
3415			// Definitions
3416			//
3417
3418	0		binary_encoder::binary_encoder() {
3419	0	0	data.reserve(16);
3420	0		}
3421
3422	0		void binary_encoder::add_1B(unsigned val) {
3423	0	0	if (uint8_t(val) != val) training_failure("Should encode value " << val << " in one byte!");
		0
		0
3424	0		data.push_back(val);
3425	0		}
3426
3427	0		void binary_encoder::add_2B(unsigned val) {
3428	0	0	if (uint16_t(val) != val) training_failure("Should encode value " << val << " in two bytes!");
		0
		0
3429	0		data.insert(data.end(), (unsigned char) &val, ((unsigned char) &val) + sizeof(uint16_t));
3430	0		}
3431
3432			void binary_encoder::add_4B(unsigned val) {
3433			if (uint32_t(val) != val) training_failure("Should encode value " << val << " in four bytes!");
3434	0		data.insert(data.end(), (unsigned char) &val, ((unsigned char) &val) + sizeof(uint32_t));
3435			}
3436
3437			void binary_encoder::add_float(double val) {
3438			data.insert(data.end(), (unsigned char) &val, ((unsigned char) &val) + sizeof(float));
3439			}
3440
3441			void binary_encoder::add_double(double val) {
3442			data.insert(data.end(), (unsigned char) &val, ((unsigned char) &val) + sizeof(double));
3443			}
3444
3445	0		void binary_encoder::add_str(string_piece str) {
3446	0		add_1B(str.len < 255 ? str.len : 255);
3447	0	0	if (!(str.len < 255)) add_4B(str.len);
3448			add_data(str);
3449	0		}
3450
3451			void binary_encoder::add_data(string_piece data) {
3452	0		this->data.insert(this->data.end(), (const unsigned char) data.str, (const unsigned char) (data.str + data.len));
3453			}
3454
3455			template
3456			void binary_encoder::add_data(const vector& data) {
3457	0		this->data.insert(this->data.end(), (const unsigned char) data.data(), (const unsigned char) (data.data() + data.size()));
3458			}
3459
3460			template
3461			void binary_encoder::add_data(const T* data, size_t elements) {
3462	0		this->data.insert(this->data.end(), (const unsigned char) data, (const unsigned char) (data + elements));
3463			}
3464
3465			} // namespace utils
3466
3467			/////////
3468			// File: utils/pointer_decoder.h
3469			/////////
3470
3471			// This file is part of UFAL C++ Utils .
3472			//
3473			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
3474			// Mathematics and Physics, Charles University in Prague, Czech Republic.
3475			//
3476			// This Source Code Form is subject to the terms of the Mozilla Public
3477			// License, v. 2.0. If a copy of the MPL was not distributed with this
3478			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
3479
3480			namespace utils {
3481
3482			//
3483			// Declarations
3484			//
3485
// Sequential reader over a raw byte buffer. It holds a REFERENCE to the
// caller's cursor, so every read advances the caller's pointer too.
// No bounds checking is performed.
class pointer_decoder {
 public:
  inline pointer_decoder(const unsigned char*& data);
  // Read an unsigned integer of the given width and advance past it.
  inline unsigned next_1B();
  inline unsigned next_2B();
  inline unsigned next_4B();
  // Read a length-prefixed string: a 1-byte length, where the value 255
  // means that the real length follows as a 4-byte integer.
  inline void next_str(string& str);
  // Return a pointer to the next `elements` values of type T and skip them.
  template <class T> inline const T* next(unsigned elements);

 private:
  const unsigned char*& data;
};

//
// Definitions
//

pointer_decoder::pointer_decoder(const unsigned char*& data) : data(data) {}

unsigned pointer_decoder::next_1B() {
  return *data++;
}

unsigned pointer_decoder::next_2B() {
  // memcpy is used because the cursor need not be aligned.
  uint16_t result;
  memcpy(&result, data, sizeof(uint16_t));
  data += sizeof(uint16_t);
  return result;
}

unsigned pointer_decoder::next_4B() {
  uint32_t result;
  memcpy(&result, data, sizeof(uint32_t));
  data += sizeof(uint32_t);
  return result;
}

void pointer_decoder::next_str(string& str) {
  unsigned len = next_1B();
  if (len == 255) len = next_4B();
  str.assign(next<char>(len), len);
}

template <class T> const T* pointer_decoder::next(unsigned elements) {
  const T* result = (const T*) data;
  data += sizeof(T) * elements;
  return result;
}
3534
3535			} // namespace utils
3536
3537			/////////
3538			// File: utils/unaligned_access.h
3539			/////////
3540
3541			// This file is part of UFAL C++ Utils .
3542			//
3543			// Copyright 2017 Institute of Formal and Applied Linguistics, Faculty of
3544			// Mathematics and Physics, Charles University in Prague, Czech Republic.
3545			//
3546			// This Source Code Form is subject to the terms of the Mozilla Public
3547			// License, v. 2.0. If a copy of the MPL was not distributed with this
3548			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
3549
3550			namespace utils {
3551
3552			//
3553			// Declarations
3554			//
3555
// Helpers reading and writing values of type T at addresses which need not be
// aligned for T. All accesses go through memcpy. P is the pointee type of the
// supplied pointer and is normally deduced from the argument.

// Load a T from ptr.
template <class T, class P = T>
inline T unaligned_load(const P* ptr);

// Load a T from ptr and advance ptr by sizeof(T) bytes.
template <class T, class P = T>
inline T unaligned_load_inc(const P*& ptr);

// Store value at ptr.
template <class T, class P = T>
inline void unaligned_store(P* ptr, T value);

// Store value at ptr and advance ptr by sizeof(T) bytes.
template <class T, class P = T>
inline void unaligned_store_inc(P*& ptr, T value);

// Binary search over `size` elements starting at `first`, which must be
// sorted; returns the first position whose element is not less than val.
template <class T>
T* unaligned_lower_bound(T* first, size_t size, T val);

// As above, but returns the first position whose element is greater than val.
template <class T>
T* unaligned_upper_bound(T* first, size_t size, T val);

//
// Definitions
//

template <class T, class P>
inline T unaligned_load(const P* ptr) {
  T value;
  memcpy(&value, ptr, sizeof(T));
  return value;
}

template <class T, class P>
inline T unaligned_load_inc(const P*& ptr) {
  T value;
  memcpy(&value, ptr, sizeof(T));
  // Advance in bytes regardless of the pointee type P.
  ((const char*&)ptr) += sizeof(T);
  return value;
}

template <class T, class P>
inline void unaligned_store(P* ptr, T value) {
  memcpy(ptr, &value, sizeof(T));
}

template <class T, class P>
inline void unaligned_store_inc(P*& ptr, T value) {
  memcpy(ptr, &value, sizeof(T));
  // Advance in bytes regardless of the pointee type P.
  ((char*&)ptr) += sizeof(T);
}

template <class T>
T* unaligned_lower_bound(T* first, size_t size, T val) {
  while (size) {
    size_t step = size >> 1;
    if (unaligned_load<T>(first + step) < val) {
      first += step + 1;
      size -= step + 1;
    } else {
      size = step;
    }
  }
  return first;
}

template <class T>
T* unaligned_upper_bound(T* first, size_t size, T val) {
  while (size) {
    size_t step = size >> 1;
    if (!(val < unaligned_load<T>(first + step))) {
      first += step + 1;
      size -= step + 1;
    } else {
      size = step;
    }
  }
  return first;
}
3631
3632			} // namespace utils
3633
3634			/////////
3635			// File: morphodita/morpho/persistent_unordered_map.h
3636			/////////
3637
3638			// This file is part of MorphoDiTa .
3639			//
3640			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
3641			// Mathematics and Physics, Charles University in Prague, Czech Republic.
3642			//
3643			// This Source Code Form is subject to the terms of the Mozilla Public
3644			// License, v. 2.0. If a copy of the MPL was not distributed with this
3645			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
3646
3647			namespace morphodita {
3648
3649			// Declarations
3650	103	0	class persistent_unordered_map {
		0
3651			public:
3652			// Accessing function
3653			template
3654			inline const unsigned char* at(const char* str, int len, EntrySize entry_size) const;
3655
3656			template
3657			inline const T* at_typed(const char* str, int len) const;
3658
3659			template
3660			inline void iter(const char* str, int len, EntryProcess entry_process) const;
3661
3662			template
3663			inline void iter_all(EntryProcess entry_process) const;
3664
3665			// Two helper functions accessing some internals
3666			inline int max_length() const;
3667			inline const unsigned char* data_start(int len) const;
3668
3669			// Creation functions
3670			persistent_unordered_map() {}
3671			template
3672			persistent_unordered_map(const unordered_map& map, double load_factor, EntryEncode entry_encode);
3673			template
3674			persistent_unordered_map(const unordered_map& map, double load_factor, bool add_prefixes, bool add_suffixes, EntryEncode entry_encode);
3675
3676			// Manual creation functions
3677			inline void resize(unsigned elems);
3678			inline void add(const char* str, int str_len, int data_len);
3679			inline void done_adding();
3680			inline unsigned char* fill(const char* str, int str_len, int data_len);
3681			inline void done_filling();
3682
3683			// Serialization
3684			inline void load(binary_decoder& data);
3685			inline void save(binary_encoder& enc);
3686
3687			private:
3688			struct fnv_hash;
3689			vector hashes;
3690
3691			template
3692			void construct(const map& map, double load_factor, EntryEncode entry_encode);
3693			};
3694
3695			// Definitions
3696	1063	0	struct persistent_unordered_map::fnv_hash {
3697	24		fnv_hash(unsigned num) {
3698	24		mask = 1;
3699	76	100	while (mask < num)
3700	52		mask <<= 1;
3701	24	50	hash.resize(mask + 1);
3702	24		mask--;
3703	24		}
3704	484		fnv_hash(binary_decoder& data) {
3705	484	50	uint32_t size = data.next_4B();
3706	484		mask = size - 2;
3707	484	50	hash.resize(size);
3708	484	50	memcpy(hash.data(), data.next(size), size * sizeof(uint32_t));
3709
3710	484	50	size = data.next_4B();
3711	484	50	this->data.resize(size);
3712	484	100	if (size) memcpy(this->data.data(), data.next(size), size);
		50
3713	484		}
3714
3715			inline uint32_t index(const char* data, int len) const {
3716	464	0	if (len <= 0) return 0;
		0
		0
		0
		0
		0
		0
		0
		0
		50
		50
		0
		0
		0
		0
		0
		0
		50
		50
		50
		50
3717	456	0	if (len == 1) return unaligned_load(data);
		0
		0
		0
		0
		0
		0
		0
		0
		100
		0
		0
		0
		0
		0
		0
		0
		100
		100
		100
		100
3718	427	0	if (len == 2) return unaligned_load(data);
		0
		0
		0
		0
		0
		0
		0
		0
		100
		0
		0
		0
		0
		0
		0
		0
		100
		100
		100
		100
3719
3720			uint32_t hash = 2166136261U;
3721	1563	0	while (len--)
		0
		0
		0
		0
		0
		0
		0
		0
		100
		0
		0
		0
		0
		0
		0
		0
		100
		100
		100
		100
3722	1413		hash = (hash ^ unsigned((signed char)data++)) 16777619U;
3723	150		return hash & mask;
3724			}
3725
3726			inline void save(binary_encoder& enc);
3727
3728			unsigned mask;
3729			vector hash;
3730			vector data;
3731			};
3732
3733			template
3734	8		const unsigned char* persistent_unordered_map::at(const char* str, int len, EntrySize entry_size) const {
3735	8	0	if (unsigned(len) >= hashes.size()) return nullptr;
		0
		0
		0
		0
		0
		0
		0
		50
		0
		0
3736
3737	8		unsigned index = hashes[len].index(str, len);
3738	16		const unsigned char* data = hashes[len].data.data() + hashes[len].hash[index];
3739	16		const unsigned char* end = hashes[len].data.data() + hashes[len].hash[index+1];
3740
3741	8	0	if (len <= 2)
		0
		0
		0
		0
		0
		0
		0
		50
		0
		0
3742	8	0	return data != end ? data + len : nullptr;
		0
		0
		0
		0
		0
		0
		0
		50
		0
		0
3743
3744	0	0	while (data < end) {
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
3745	0	0	if (small_memeq(str, data, len)) return data + len;
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
3746	0		data += len;
3747			pointer_decoder decoder(data);
3748	0		entry_size(decoder);
3749			}
3750
3751			return nullptr;
3752			}
3753
3754			template
3755	438		const T* persistent_unordered_map::at_typed(const char* str, int len) const {
3756	438	100	if (unsigned(len) >= hashes.size()) return nullptr;
		100
3757
3758	408		unsigned index = hashes[len].index(str, len);
3759	816		const unsigned char* data = hashes[len].data.data() + hashes[len].hash[index];
3760	816		const unsigned char* end = hashes[len].data.data() + hashes[len].hash[index+1];
3761
3762	408	100	if (len <= 2)
		100
3763	293	100	return data != end ? (const T*)(data + len) : nullptr;
		100
3764
3765	146	100	while (data < end) {
		100
3766	133	100	if (small_memeq(str, data, len)) return (const T*)(data + len);
		100
3767	31		data += len + sizeof(T);
3768			}
3769
3770			return nullptr;
3771			}
3772
3773			template
3774	8		void persistent_unordered_map::iter(const char* str, int len, EntryProcess entry_process) const {
3775	8	0	if (unsigned(len) >= hashes.size()) return;
		50
		0
		0
		0
		0
3776
3777	8		unsigned index = hashes[len].index(str, len);
3778	16		const unsigned char* data = hashes[len].data.data() + hashes[len].hash[index];
3779	8		const unsigned char* end = hashes[len].data.data() + hashes[len].hash[index+1];
3780
3781	21	0	while (data < end) {
		100
		0
		0
		0
		0
3782			auto start = (const char*) data;
3783	13		data += len;
3784			pointer_decoder decoder(data);
3785	13		entry_process(start, decoder);
3786			}
3787			}
3788
3789			template
3790	2		void persistent_unordered_map::iter_all(EntryProcess entry_process) const {
3791	2	100	for (unsigned len = 0; len < hashes.size(); len++) {
		0
		0
3792	1		const unsigned char* data = hashes[len].data.data();
3793			const unsigned char* end = data + hashes[len].data.size();
3794
3795	2	100	while (data < end) {
		0
		0
3796			auto start = (const char*) data;
3797	1		data += len;
3798			pointer_decoder decoder(data);
3799	1		entry_process(start, len, decoder);
3800			}
3801			}
3802	1		}
3803
3804			int persistent_unordered_map::max_length() const {
3805	20		return hashes.size();
3806			}
3807
3808			const unsigned char* persistent_unordered_map::data_start(int len) const {
3809	30	0	return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr;
		0
		0
		0
		0
		0
		0
3810			}
3811
3812	24		void persistent_unordered_map::resize(unsigned elems) {
3813	24	100	if (hashes.size() == 0) hashes.emplace_back(1);
3814	22	100	else if (hashes.size() == 1) hashes.emplace_back(1<<8);
3815	20	100	else if (hashes.size() == 2) hashes.emplace_back(1<<16);
3816	18		else hashes.emplace_back(elems);
3817	24		}
3818
3819	20		void persistent_unordered_map::add(const char* str, int str_len, int data_len) {
3820	20	50	if (unsigned(str_len) < hashes.size())
3821	20		hashes[str_len].hash[hashes[str_len].index(str, str_len)] += str_len + data_len;
3822	20		}
3823
3824	2		void persistent_unordered_map::done_adding() {
3825	26	100	for (auto&& hash : hashes) {
3826			int total = 0;
3827	131657	100	for (auto&& len : hash.hash) total += len, len = total - len;
3828	24		hash.data.resize(total);
3829			}
3830	2		}
3831
3832	20		unsigned char* persistent_unordered_map::fill(const char* str, int str_len, int data_len) {
3833	20	50	if (unsigned(str_len) < hashes.size()) {
3834	20		unsigned index = hashes[str_len].index(str, str_len);
3835	40		unsigned offset = hashes[str_len].hash[index];
3836	20		small_memcpy(hashes[str_len].data.data() + offset, str, str_len);
3837	20		hashes[str_len].hash[index] += str_len + data_len;
3838	20		return hashes[str_len].data.data() + offset + str_len;
3839			}
3840			return nullptr;
3841			}
3842
3843	2		void persistent_unordered_map::done_filling() {
3844	26	100	for (auto&& hash : hashes)
3845	131657	100	for (int i = hash.hash.size() - 1; i >= 0; i--)
3846	131633	100	hash.hash[i] = i > 0 ? hash.hash[i-1] : 0;
3847	2		}
3848
3849	103		void persistent_unordered_map::load(binary_decoder& data) {
3850	103		unsigned sizes = data.next_1B();
3851
3852			hashes.clear();
3853	587	100	for (unsigned i = 0; i < sizes; i++)
3854	484		hashes.emplace_back(data);
3855	103		}
3856
3857			} // namespace morphodita
3858
3859			/////////
3860			// File: morphodita/derivator/derivator_dictionary.h
3861			/////////
3862
3863			// This file is part of MorphoDiTa .
3864			//
3865			// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
3866			// Mathematics and Physics, Charles University in Prague, Czech Republic.
3867			//
3868			// This Source Code Form is subject to the terms of the Mozilla Public
3869			// License, v. 2.0. If a copy of the MPL was not distributed with this
3870			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
3871
3872			namespace morphodita {
3873
3874	0		class derivator_dictionary : public derivator {
3875			public:
3876			virtual bool parent(string_piece lemma, derivated_lemma& parent) const override;
3877			virtual bool children(string_piece lemma, vector& children) const override;
3878
3879			bool load(istream& is);
3880
3881			private:
3882			friend class morpho;
3883			const morpho* dictionary;
3884			persistent_unordered_map derinet;
3885			};
3886
3887			} // namespace morphodita
3888
3889			/////////
3890			// File: utils/compressor.h
3891			/////////
3892
3893			// This file is part of UFAL C++ Utils .
3894			//
3895			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
3896			// Mathematics and Physics, Charles University in Prague, Czech Republic.
3897			//
3898			// This Source Code Form is subject to the terms of the Mozilla Public
3899			// License, v. 2.0. If a copy of the MPL was not distributed with this
3900			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
3901
3902			namespace utils {
3903
3904			class binary_decoder;
3905			class binary_encoder;
3906
			// Static helpers moving serialized data between streams and the
			// binary_decoder / binary_encoder buffers. Only the declarations are
			// visible here; the definitions live elsewhere.
3907			class compressor {
3908			public:
			// Reads from `is` into `data`. Callers treat a false return value as
			// a failure.
3909			static bool load(istream& is, binary_decoder& data);
			// Writes the contents of `enc` to `os`.
			// NOTE(review): presumably returns false on failure -- confirm.
3910			static bool save(ostream& os, const binary_encoder& enc);
3911			};
3912
3913			} // namespace utils
3914
3915			/////////
3916			// File: morphodita/derivator/derivator_dictionary.cpp
3917			/////////
3918
3919			// This file is part of MorphoDiTa .
3920			//
3921			// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
3922			// Mathematics and Physics, Charles University in Prague, Czech Republic.
3923			//
3924			// This Source Code Form is subject to the terms of the Mozilla Public
3925			// License, v. 2.0. If a copy of the MPL was not distributed with this
3926			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
3927
3928			namespace morphodita {
3929
3930	0		bool derivator_dictionary::parent(string_piece lemma, derivated_lemma& parent) const {
3931	0	0	if (dictionary) lemma.len = dictionary->lemma_id_len(lemma);
3932
3933	0		auto lemma_data = derinet.at(lemma.str, lemma.len, [](pointer_decoder& data) {
3934			data.next(data.next_1B());
3935			data.next_4B();
3936			data.next(data.next_2B());
3937	0		});
3938	0	0	if (lemma_data) {
3939	0		auto parent_encoded = (uint32_t)(lemma_data + 1 + *lemma_data);
3940	0	0	if (parent_encoded) {
3941	0		unsigned parent_len = parent_encoded & 0xFF;
3942	0		auto parent_data = derinet.data_start(parent_len) + (parent_encoded >> 8);
3943	0		parent.lemma.assign((const char*) parent_data, parent_len);
3944	0	0	if (parent_data[parent_len])
3945	0		parent.lemma.append((const char*) parent_data + parent_len + 1, parent_data[parent_len]);
3946			return true;
3947			}
3948			}
3949			parent.lemma.clear();
3950	0		return false;
3951			}
3952
3953	0		bool derivator_dictionary::children(string_piece lemma, vector& children) const {
3954	0	0	if (dictionary) lemma.len = dictionary->lemma_id_len(lemma);
3955
3956	0		auto lemma_data = derinet.at(lemma.str, lemma.len, [](pointer_decoder& data) {
3957			data.next(data.next_1B());
3958			data.next_4B();
3959			data.next(data.next_2B());
3960	0		});
3961	0	0	if (lemma_data) {
3962	0		auto children_len = (uint16_t)(lemma_data + 1 + *lemma_data + 4);
3963	0		auto children_encoded = (uint32_t)(lemma_data + 1 + lemma_data + 4 + 2);
3964	0	0	if (children_len) {
3965	0		children.resize(children_len);
3966	0	0	for (unsigned i = 0; i < children_len; i++) {
3967	0		unsigned child_len = children_encoded[i] & 0xFF;
3968	0		auto child_data = derinet.data_start(child_len) + (children_encoded[i] >> 8);
3969	0		children[i].lemma.assign((const char*) child_data, child_len);
3970	0	0	if (child_data[child_len])
3971	0		children[i].lemma.append((const char*) child_data + child_len + 1, child_data[child_len]);
3972			}
3973			return true;
3974			}
3975			}
3976	0		children.clear();
3977	0		return false;
3978			}
3979
3980	0		bool derivator_dictionary::load(istream& is) {
3981			binary_decoder data;
3982	0	0	if (!compressor::load(is, data)) return false;
		0
3983
3984			try {
3985	0	0	for (int i = data.next_1B(); i > 0; i--)
		0
3986	0	0	derinet.resize(data.next_4B());
		0
3987
3988			unsigned data_position = data.tell();
3989			vector lemma, parent;
3990	0	0	for (int pass = 1; pass <= 3; pass++) {
3991	0	0	if (pass > 1) data.seek(data_position);
		0
3992
3993			lemma.clear();
3994	0	0	for (int i = data.next_4B(); i > 0; i--) {
		0
3995	0	0	lemma.resize(lemma.size() - data.next_1B());
		0
3996	0	0	for (int i = data.next_1B(); i > 0; i--)
		0
3997	0	0	lemma.push_back(data.next_1B());
3998
3999	0	0	unsigned char lemma_comment_len = data.next_1B();
4000	0	0	const char* lemma_comment = lemma_comment_len ? data.next(lemma_comment_len) : nullptr;
		0
4001
4002	0	0	unsigned children = data.next_2B();
4003
4004	0	0	if (pass == 3) parent.clear();
4005			enum { REMOVE_START = 1, REMOVE_END = 2, ADD_START = 4, ADD_END = 8 };
4006	0	0	int operations = data.next_1B();
4007	0	0	if (operations) {
4008	0	0	int remove_start = operations & REMOVE_START ? data.next_1B() : 0;
		0
4009	0	0	int remove_end = operations & REMOVE_END ? data.next_1B() : 0;
		0
4010	0	0	if (operations & ADD_START) {
4011	0	0	int add_start = data.next_1B();
4012	0	0	const char* str = data.next(add_start);
4013	0	0	if (pass == 3) parent.assign(str, str + add_start);
4014			}
4015	0	0	if (pass == 3) parent.insert(parent.end(), lemma.begin() + remove_start, lemma.end() - remove_end);
		0
4016	0	0	if (operations & ADD_END) {
4017	0	0	int add_end = data.next_1B();
4018	0	0	const char* str = data.next(add_end);
4019	0	0	if (pass == 3) parent.insert(parent.end(), str, str + add_end);
4020			}
4021			}
4022
4023	0	0	if (pass == 1) {
4024	0		derinet.add(lemma.data(), lemma.size(), 1 + lemma_comment_len + 4 + 2 + 4 * children);
4025	0	0	} else if (pass == 2) {
4026	0		unsigned char* lemma_data = derinet.fill(lemma.data(), lemma.size(), 1 + lemma_comment_len + 4 + 2 + 4 * children);
4027	0		*lemma_data++ = lemma_comment_len;
4028	0	0	while (lemma_comment_len--) lemma_data++ = lemma_comment++;
4029			unaligned_store_inc(lemma_data, 0);
4030			unaligned_store_inc(lemma_data, children);
4031	0	0	if (children) unaligned_store(((uint32_t*)lemma_data) + children - 1, 0);
4032	0	0	} else if (pass == 3 && !parent.empty()) {
		0
		0
4033	0		auto lemma_data = derinet.at(lemma.data(), lemma.size(), [](pointer_decoder& data) {
4034			data.next(data.next_1B());
4035			data.next_4B();
4036			data.next(data.next_2B());
4037	0		});
4038	0		auto parent_data = derinet.at(parent.data(), parent.size(), [](pointer_decoder& data) {
4039			data.next(data.next_1B());
4040			data.next_4B();
4041			data.next(data.next_2B());
4042	0		});
4043	0	0	assert(lemma_data && parent_data);
4044
4045	0		unsigned parent_offset = parent_data - parent.size() - derinet.data_start(parent.size());
4046	0	0	assert(parent.size() < (1<<8) && parent_offset < (1<<24));
		0
4047	0		unaligned_store((void )(lemma_data + 1 + lemma_data), (parent_offset << 8) \| parent.size());
4048
4049	0		unsigned lemma_offset = lemma_data - lemma.size() - derinet.data_start(lemma.size());
4050	0	0	assert(lemma.size() < (1<<8) && lemma_offset < (1<<24));
		0
4051	0		auto children_len = unaligned_load(parent_data + 1 + *parent_data + 4);
4052	0		auto children = (uint32_t)(parent_data + 1 + parent_data + 4 + 2);
4053	0		auto child_index = unaligned_load(children + children_len - 1);
4054	0		unaligned_store(children + child_index, (lemma_offset << 8) \| lemma.size());
4055	0	0	if (child_index+1 < children_len)
4056	0		unaligned_store(children + children_len - 1, unaligned_load(children + children_len - 1) + 1);
4057			}
4058			}
4059
4060	0	0	if (pass == 1)
4061	0	0	derinet.done_adding();
4062	0	0	if (pass == 2)
4063	0		derinet.done_filling();
4064		0	}
4065			} catch (binary_decoder_error&) {
4066			return false;
4067			}
4068	0		return true;
4069			}
4070
4071			} // namespace morphodita
4072
4073			/////////
4074			// File: morphodita/morpho/casing_variants.h
4075			/////////
4076
4077			// This file is part of MorphoDiTa .
4078			//
4079			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
4080			// Mathematics and Physics, Charles University in Prague, Czech Republic.
4081			//
4082			// This Source Code Form is subject to the terms of the Mozilla Public
4083			// License, v. 2.0. If a copy of the MPL was not distributed with this
4084			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
4085
4086			namespace morphodita {
4087
4088	7		inline void generate_casing_variants(string_piece form, string& form_uclc, string& form_lc) {
4089			using namespace unilib;
4090
4091			// Detect uppercase+titlecase characters.
4092			bool first_Lut = false; // first character is uppercase or titlecase
4093			bool rest_has_Lut = false; // any character but first is uppercase or titlecase
4094			{
4095	7		string_piece form_tmp = form;
4096	14		first_Lut = unicode::category(utf8::decode(form_tmp.str, form_tmp.len)) & unicode::Lut;
4097	29	100	while (form_tmp.len && !rest_has_Lut)
		50
4098	22		rest_has_Lut = unicode::category(utf8::decode(form_tmp.str, form_tmp.len)) & unicode::Lut;
4099			}
4100
4101			// Generate all casing variants if needed (they are different than given form).
4102			// We only replace letters with their lowercase variants.
4103			// - form_uclc: first uppercase, rest lowercase
4104			// - form_lc: all lowercase
4105
4106	7	100	if (first_Lut && !rest_has_Lut) { // common case allowing fast execution
4107	1		form_lc.reserve(form.len);
4108	1		string_piece form_tmp = form;
4109	1		utf8::append(form_lc, unicode::lowercase(utf8::decode(form_tmp.str, form_tmp.len)));
4110	1		form_lc.append(form_tmp.str, form_tmp.len);
4111	6	50	} else if (!first_Lut && rest_has_Lut) {
4112	0		form_lc.reserve(form.len);
4113	0		utf8::map(unicode::lowercase, form.str, form.len, form_lc);
4114	6	50	} else if (first_Lut && rest_has_Lut) {
4115	0		form_lc.reserve(form.len);
4116	0		form_uclc.reserve(form.len);
4117	0		string_piece form_tmp = form;
4118	0		char32_t first = utf8::decode(form_tmp.str, form_tmp.len);
4119	0		utf8::append(form_lc, unicode::lowercase(first));
4120	0		utf8::append(form_uclc, first);
4121	0	0	while (form_tmp.len) {
4122	0		char32_t lowercase = unicode::lowercase(utf8::decode(form_tmp.str, form_tmp.len));
4123	0		utf8::append(form_lc, lowercase);
4124	0		utf8::append(form_uclc, lowercase);
4125			}
4126			}
4127	7		}
4128
4129			} // namespace morphodita
4130
4131			/////////
4132			// File: morphodita/morpho/czech_lemma_addinfo.h
4133			/////////
4134
4135			// This file is part of MorphoDiTa .
4136			//
4137			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
4138			// Mathematics and Physics, Charles University in Prague, Czech Republic.
4139			//
4140			// This Source Code Form is subject to the terms of the Mozilla Public
4141			// License, v. 2.0. If a copy of the MPL was not distributed with this
4142			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
4143
4144			namespace morphodita {
4145
4146			// Declarations
4147	0		struct czech_lemma_addinfo {
4148			inline static int raw_lemma_len(string_piece lemma);
4149			inline static int lemma_id_len(string_piece lemma);
4150			inline static string format(const unsigned char* addinfo, int addinfo_len);
4151			inline static bool generatable(const unsigned char* addinfo, int addinfo_len);
4152
4153			inline int parse(string_piece lemma, bool die_on_failure = false);
4154			inline bool match_lemma_id(const unsigned char* other_addinfo, int other_addinfo_len);
4155
4156			vector data;
4157			};
4158
4159			// Definitions
4160	0		int czech_lemma_addinfo::raw_lemma_len(string_piece lemma) {
4161			// Lemma ends by a '-[0-9]', '`' or '_' on non-first position.
4162	0	0	for (unsigned len = 1; len < lemma.len; len++)
4163	0	0	if (lemma.str[len] == '`' \|\| lemma.str[len] == '_' \|\|
		0
4164	0	0	(lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9'))
		0
		0
4165	0		return len;
4166	0		return lemma.len;
4167			}
4168
4169	0		int czech_lemma_addinfo::lemma_id_len(string_piece lemma) {
4170			// Lemma ends by a '-[0-9]', '`' or '_' on non-first position.
4171	0	0	for (unsigned len = 1; len < lemma.len; len++) {
4172	0	0	if (lemma.str[len] == '`' \|\| lemma.str[len] == '_')
4173	0		return len;
4174	0	0	if (lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9') {
		0
		0
		0
4175	0		len += 2;
4176	0	0	while (len < lemma.len && lemma.str[len] >= '0' && lemma.str[len] <= '9') len++;
		0
		0
4177	0		return len;
4178			}
4179			}
4180	0		return lemma.len;
4181			}
4182
4183	0		string czech_lemma_addinfo::format(const unsigned char* addinfo, int addinfo_len) {
4184			string res;
4185
4186	0	0	if (addinfo_len) {
4187	0	0	res.reserve(addinfo_len + 4);
4188	0	0	if (addinfo[0] != 255) {
4189			char num[5];
4190	0		snprintf(num, sizeof(num), "-%u", addinfo[0]);
4191			res += num;
4192			}
4193	0	0	for (int i = 1; i < addinfo_len; i++)
4194	0		res += addinfo[i];
4195			}
4196
4197	0		return res;
4198			}
4199
4200			bool czech_lemma_addinfo::generatable(const unsigned char* addinfo, int addinfo_len) {
4201	0	0	for (int i = 1; i + 2 < addinfo_len; i++)
4202	0	0	if (addinfo[i] == '_' && addinfo[i+1] == ',' && addinfo[i+2] == 'x')
		0
		0
4203			return false;
4204
4205			return true;
4206			}
4207
4208	0		int czech_lemma_addinfo::parse(string_piece lemma, bool die_on_failure) {
4209			data.clear();
4210
4211	0		const char* lemma_info = lemma.str + raw_lemma_len(lemma);
4212	0	0	if (lemma_info < lemma.str + lemma.len) {
4213	0		int lemma_num = 255;
4214			const char* lemma_additional_info = lemma_info;
4215
4216	0	0	if (*lemma_info == '-') {
4217	0		lemma_num = 0;
4218	0		for (lemma_additional_info = lemma_info + 1;
4219	0	0	lemma_additional_info < lemma.str + lemma.len && (lemma_additional_info >= '0' && lemma_additional_info <= '9');
		0
4220			lemma_additional_info++)
4221	0		lemma_num = 10 * lemma_num + (*lemma_additional_info - '0');
4222
4223	0	0	if (lemma_additional_info == lemma_info + 1 \|\| (lemma_additional_info < lemma.str + lemma.len && lemma_additional_info != '`' && lemma_additional_info != '_') \|\| lemma_num >= 255) {
		0
		0
		0
		0
4224	0	0	if (die_on_failure)
4225	0	0	training_failure("Lemma number " << lemma_num << " in lemma " << lemma << " out of range!");
		0
		0
4226			else
4227	0		lemma_num = 255;
4228			}
4229			}
4230	0		data.emplace_back(lemma_num);
4231	0	0	while (lemma_additional_info < lemma.str + lemma.len)
4232	0		data.push_back((unsigned char)lemma_additional_info++);
4233
4234	0	0	if (data.size() > 255) {
4235	0	0	if (die_on_failure)
4236	0	0	training_failure("Too long lemma info " << lemma_info << " in lemma " << lemma << '!');
		0
4237			else
4238	0		data.resize(255);
4239			}
4240			}
4241
4242	0		return lemma_info - lemma.str;
4243			}
4244
4245			bool czech_lemma_addinfo::match_lemma_id(const unsigned char* other_addinfo, int other_addinfo_len) {
4246	0	0	if (data.empty()) return true;
4247	0	0	if (data[0] != 255 && (!other_addinfo_len \|\| other_addinfo[0] != data[0])) return false;
		0
		0
		0
4248			return true;
4249			}
4250
4251			} // namespace morphodita
4252
4253			/////////
4254			// File: morphodita/morpho/tag_filter.h
4255			/////////
4256
4257			// This file is part of MorphoDiTa .
4258			//
4259			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
4260			// Mathematics and Physics, Charles University in Prague, Czech Republic.
4261			//
4262			// This Source Code Form is subject to the terms of the Mozilla Public
4263			// License, v. 2.0. If a copy of the MPL was not distributed with this
4264			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
4265
4266			namespace morphodita {
4267
4268			// Declarations
4269	0		class tag_filter {
4270			public:
4271			tag_filter(const char* filter = nullptr);
4272
4273			inline bool matches(const char* tag) const;
4274
4275			private:
4276			struct char_filter {
4277			char_filter(int pos, bool negate, int chars_offset, int chars_len)
4278	0		: pos(pos), negate(negate), chars_offset(chars_offset), chars_len(chars_len) {}
4279
4280			int pos;
4281			bool negate;
4282			int chars_offset, chars_len;
4283			};
4284
4285			string wildcard;
4286			std::vector filters;
4287			};
4288
4289			// Definitions
4290	0		inline bool tag_filter::matches(const char* tag) const {
4291	0	0	if (filters.empty()) return true;
4292
4293			int tag_pos = 0;
4294	0	0	for (auto&& filter : filters) {
4295			// Skip until next filter position. If the tag ends prematurely, accept.
4296	0	0	while (tag_pos < filter.pos)
4297	0	0	if (!tag[tag_pos++])
4298			return true;
4299	0	0	if (!tag[tag_pos])
4300			return true;
4301
4302			// We assume filter.chars_len >= 1.
4303	0		bool matched = (wildcard[filter.chars_offset] == tag[tag_pos]) ^ filter.negate;
4304	0	0	for (int i = 1; i < filter.chars_len && ((!matched) ^ filter.negate); i++)
		0
4305	0		matched = (wildcard[filter.chars_offset + i] == tag[tag_pos]) ^ filter.negate;
4306	0	0	if (!matched) return false;
4307			}
4308			return true;
4309			}
4310
4311			} // namespace morphodita
4312
4313			/////////
4314			// File: morphodita/morpho/morpho_dictionary.h
4315			/////////
4316
4317			// This file is part of MorphoDiTa .
4318			//
4319			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
4320			// Mathematics and Physics, Charles University in Prague, Czech Republic.
4321			//
4322			// This Source Code Form is subject to the terms of the Mozilla Public
4323			// License, v. 2.0. If a copy of the MPL was not distributed with this
4324			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
4325
4326			namespace morphodita {
4327
4328			// Declarations
4329			template
4330	2		class morpho_dictionary {
4331			public:
4332			void load(binary_decoder& data);
4333			void analyze(string_piece form, vector& lemmas) const;
4334			bool generate(string_piece lemma, const tag_filter& filter, vector& lemmas_forms) const;
4335			private:
4336			persistent_unordered_map lemmas, roots, suffixes;
4337
4338			vector tags;
4339			vector>>> classes;
4340			};
4341
4342			// Definitions
4343			template
4344	1		void morpho_dictionary::load(binary_decoder& data) {
4345			// Prepare lemmas and roots hashes
4346	13	100	for (int i = data.next_1B(); i > 0; i--)
		0
		0
4347	12		lemmas.resize(data.next_4B());
4348	13	100	for (int i = data.next_1B(); i > 0; i--)
		0
		0
4349	12		roots.resize(data.next_4B());
4350
4351			// Perform two pass over the lemmas and roots data, filling the hashes.
4352
4353	1		vector lemma(max(lemmas.max_length(), roots.max_length()));
4354	1	50	vector root(max(lemmas.max_length(), roots.max_length()));
		0
		0
4355			unsigned data_position = data.tell();
4356	3	100	for (int pass = 1; pass <= 2; pass++) {
		0
		0
4357	2	100	if (pass > 1) data.seek(data_position);
		50
		0
		0
		0
		0
4358
4359			int lemma_len = 0;
4360			int root_len = 0;
4361
4362	22	50	for (int i = data.next_4B(); i > 0; i--) {
		100
		0
		0
		0
		0
4363	20	50	lemma_len -= data.next_1B();
		0
		0
4364	126	50	for (int i = data.next_1B(); i > 0; i--)
		100
		0
		0
		0
		0
4365	106	50	lemma[lemma_len++] = data.next_1B();
		0
		0
4366	20	50	unsigned char lemma_info_len = data.next_1B();
		0
		0
4367	20	50	const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr;
		0
		0
		0
		0
		0
4368	20	50	unsigned lemma_roots = data.next_1B();
		0
		0
4369
4370			unsigned char* lemma_data /* to keep compiler happy */ = nullptr;
4371			unsigned lemma_offset /* to keep compiler happy */ = 0;
4372
4373	20	100	if (pass == 1) {
		0
		0
4374	10		lemmas.add(lemma.data(), lemma_len, 1 + lemma_info_len + 1 + lemma_roots * (sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint16_t)));
4375			} else /if (pass == 2)/ {
4376	10		lemma_data = lemmas.fill(lemma.data(), lemma_len, 1 + lemma_info_len + 1 + lemma_roots * (sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint16_t)));
4377	20		lemma_offset = lemma_data - lemma_len - lemmas.data_start(lemma_len);
4378
4379	10		*lemma_data++ = lemma_info_len;
4380	10	50	if (lemma_info_len) small_memcpy(lemma_data, lemma_info, lemma_info_len), lemma_data += lemma_info_len;
		0
		0
4381	10		*lemma_data++ = lemma_roots;
4382			}
4383
4384	20		small_memcpy(root.data(), lemma.data(), lemma_len); root_len = lemma_len;
4385	40	100	for (unsigned i = 0; i < lemma_roots; i++) {
		0
		0
4386			enum { REMOVE_START = 1, REMOVE_END = 2, ADD_START = 4, ADD_END = 8 };
4387	20	50	int operations = data.next_1B();
		0
		0
4388	48	100	if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; }
		50
		100
		0
		0
		0
		0
		0
		0
4389	20	100	if (operations & REMOVE_END) root_len -= data.next_1B();
		50
		0
		0
		0
		0
4390	20	100	if (operations & ADD_START) {
		0
		0
4391	44	50	int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to;
		100
		0
		0
		0
		0
4392	14	100	for (int i = 0; i < to; i++) root[i] = data.next_1B();
		50
		0
		0
		0
		0
4393			}
4394	20	100	if (operations & ADD_END)
		0
		0
4395	34	50	for (int len = data.next_1B(); len > 0; len--)
		100
		0
		0
		0
		0
4396	22	50	root[root_len++] = data.next_1B();
		0
		0
4397	20	50	uint16_t clas = data.next_2B();
		0
		0
4398
4399	20	100	if (pass == 1) { // for each root
		0
		0
4400	10		roots.add(root.data(), root_len, sizeof(uint16_t) + sizeof(uint32_t) + sizeof(uint8_t));
4401			} else /if (pass == 2)/ {
4402	10		unsigned char* root_data = roots.fill(root.data(), root_len, sizeof(uint16_t) + sizeof(uint32_t) + sizeof(uint8_t));
4403	20		unsigned root_offset = root_data - root_len - roots.data_start(root_len);
4404
4405			unaligned_store_inc(root_data, clas);
4406			unaligned_store_inc(root_data, lemma_offset);
4407			unaligned_store_inc(root_data, lemma_len);
4408	10	50	assert(uint8_t(lemma_len) == lemma_len);
		0
		0
4409
4410			unaligned_store_inc(lemma_data, root_offset);
4411			unaligned_store_inc(lemma_data, root_len);
4412			unaligned_store_inc(lemma_data, clas);
4413	10	50	assert(uint8_t(root_len) == root_len);
		0
		0
4414			}
4415			}
4416			}
4417
4418	2	100	if (pass == 1) { // after the whole pass
		0
		0
4419	1	50	lemmas.done_adding();
		0
		0
4420	1	50	roots.done_adding();
		0
		0
4421			} else /if (pass == 2)/ {
4422	1		lemmas.done_filling();
4423	1		roots.done_filling();
4424			}
4425			}
4426
4427			// Load tags
4428	1	50	tags.resize(data.next_2B());
		50
		0
		0
		0
		0
4429	7	100	for (auto&& tag : tags) {
		0
		0
4430	6	50	tag.resize(data.next_1B());
		0
		0
4431	403	100	for (unsigned i = 0; i < tag.size(); i++)
		0
		0
4432	397	50	tag[i] = data.next_1B();
		0
		0
4433			}
4434
4435			// Load suffixes
4436	1	50	suffixes.load(data);
		0
		0
4437
4438			// Fill classes from suffixes
4439	2	50	suffixes.iter_all([this](const char* suffix, int len, pointer_decoder& data) mutable {
		0
		0
4440			unsigned classes_len = data.next_2B();
4441			const uint16_t* classes_ptr = data.next(classes_len);
4442	1		const uint16_t* indices_ptr = data.next(classes_len + 1);
4443	1		uint32_t tags_len = unaligned_load(indices_ptr);
4444	7	100	for (unsigned i = 0; i < classes_len; i++)
		0
		0
4445	6		tags_len += uint16_t(unaligned_load(indices_ptr + i + 1) - unaligned_load(indices_ptr + i));
4446			const uint16_t* tags_ptr = data.next(tags_len);
4447
4448	1		string suffix_str(suffix, len);
4449	1		uint32_t index = unaligned_load(indices_ptr), prev_index = 0;
4450	7	100	for (unsigned i = 0; i < classes_len; i++) {
		0
		0
4451	6		auto classes_ptr_i = unaligned_load(classes_ptr + i);
4452	6	50	if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1);
		50
		0
		0
		0
		0
4453			prev_index = index;
4454	6		index += uint16_t(unaligned_load(indices_ptr + i + 1) - unaligned_load(indices_ptr + i));
4455	6	50	classes[classes_ptr_i].emplace_back(suffix_str, vector());
		0
		0
4456	12	100	for (const uint16_t* ptr = tags_ptr + prev_index; ptr < tags_ptr + index; ptr++)
		0
		0
4457	6	50	classes[classes_ptr_i].back().second.emplace_back(unaligned_load(ptr));
		0
		0
4458			}
4459	1		});
4460	1		}
4461
4462			template
4463	8		void morpho_dictionary::analyze(string_piece form, vector& lemmas) const {
4464			int max_suffix_len = suffixes.max_length();
4465
4466			uint16_t* suff_stack[16]; vector suff_heap;
4467	8	50	uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data());
		0
		0
		0
		0
		0
4468			int suff_len = 0;
4469	16	100	for (int i = form.len; i >= 0 && suff_len < max_suffix_len; i--, suff_len++) {
		0
		0
4470	8		suff[suff_len] = (uint16_t*) suffixes.at(form.str + i, suff_len, [](pointer_decoder& data) {
4471	0		data.next(2 * data.next_2B());
4472			data.next(data.next_2B());
4473	0		});
4474	8		if (!suff[suff_len]) break;
4475			}
4476
4477	16	100	for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++)
		50
		100
		0
		0
		0
		0
		0
		0
4478	8	50	if (unaligned_load(suff[suff_len])) {
		0
		0
4479	8		unsigned suff_classes = unaligned_load(suff[suff_len]);
4480	8		uint16_t* suff_data = suff[suff_len] + 1;
4481
4482	21	50	roots.iter(form.str, root_len, [&](const char* root, pointer_decoder& root_data) {
		0
		0
4483			uint16_t root_class = root_data.next_2B();
4484			unsigned lemma_offset = root_data.next_4B();
4485			unsigned lemma_len = root_data.next_1B();
4486
4487	26	100	if (small_memeq(form.str, root, root_len)) {
		0
		0
4488	19		uint16_t* suffix_class_ptr = unaligned_lower_bound(suff_data, suff_classes, root_class);
4489	10	50	if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) {
		50
		50
		0
		0
		0
		0
		0
		0
4490	30		const unsigned char* lemma_data = this->lemmas.data_start(lemma_len) + lemma_offset;
4491			string lemma((const char*)lemma_data, lemma_len);
4492	10	50	if (lemma_data[lemma_len]) lemma += LemmaAddinfo::format(lemma_data + lemma_len + 1, lemma_data[lemma_len]);
		0
		0
		0
4493
4494	20		uint16_t* suff_tag_indices = suff_data + suff_classes;
4495	10		uint16_t* suff_tags = suff_tag_indices + suff_classes + 1;
4496	20	100	for (unsigned i = unaligned_load(suff_tag_indices + (suffix_class_ptr - suff_data));
		0
		0
4497	20		i < unaligned_load(suff_tag_indices + (suffix_class_ptr - suff_data) + 1); i++)
4498	10	50	lemmas.emplace_back(lemma, tags[unaligned_load(suff_tags + i)]);
		0
		0
4499			}
4500			}
4501	13		});
4502			}
4503	8		}
4504
4505			template
4506	0		bool morpho_dictionary::generate(string_piece lemma, const tag_filter& filter, vector& lemmas_forms) const {
4507			LemmaAddinfo addinfo;
4508	0	0	int raw_lemma_len = addinfo.parse(lemma);
		0
4509	0		bool matched_lemma = false;
4510
4511	0	0	lemmas.iter(lemma.str, raw_lemma_len, [&](const char* lemma_str, pointer_decoder& data) {
		0
		0
4512			unsigned lemma_info_len = data.next_1B();
4513			const auto* lemma_info = data.next(lemma_info_len);
4514			unsigned lemma_roots_len = data.next_1B();
4515	0		auto* lemma_roots_ptr = data.next(lemma_roots_len * (sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint16_t)));
4516
4517	0	0	if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) {
		0
		0
		0
		0
		0
		0
		0
		0
4518	0		matched_lemma = true;
4519
4520			vector* forms = nullptr;
4521			pointer_decoder lemma_roots(lemma_roots_ptr);
4522	0	0	for (unsigned i = 0; i < lemma_roots_len; i++) {
		0
		0
4523			unsigned root_offset = lemma_roots.next_4B();
4524			unsigned root_len = lemma_roots.next_1B();
4525			unsigned clas = lemma_roots.next_2B();
4526
4527	0		const unsigned char* root_data = roots.data_start(root_len) + root_offset;
4528	0	0	for (auto&& suffix : classes[clas]) {
		0
		0
4529			string root_with_suffix;
4530	0	0	for (auto&& tag : suffix.second)
		0
		0
4531	0	0	if (filter.matches(tags[tag].c_str())) {
		0
		0
4532	0	0	if (!forms) {
		0
		0
4533	0	0	lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len));
		0
		0
		0
		0
		0
		0
4534	0		forms = &lemmas_forms.back().forms;
4535			}
4536
4537	0	0	if (root_with_suffix.empty() && root_len + suffix.first.size()) {
		0
		0
		0
		0
		0
		0
		0
		0
4538	0	0	root_with_suffix.reserve(root_len + suffix.first.size());
		0
		0
4539			root_with_suffix.assign((const char*)root_data, root_len);
4540			root_with_suffix.append(suffix.first);
4541			}
4542
4543	0	0	forms->emplace_back(root_with_suffix, tags[tag]);
		0
		0
4544			}
4545			}
4546			}
4547			}
4548	0		});
4549
4550	0		return matched_lemma;
4551			}
4552
4553			} // namespace morphodita
4554
4555			/////////
4556			// File: morphodita/morpho/morpho_prefix_guesser.h
4557			/////////
4558
4559			// This file is part of MorphoDiTa .
4560			//
4561			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
4562			// Mathematics and Physics, Charles University in Prague, Czech Republic.
4563			//
4564			// This Source Code Form is subject to the terms of the Mozilla Public
4565			// License, v. 2.0. If a copy of the MPL was not distributed with this
4566			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
4567
4568			namespace morphodita {
4569
4570			// Declarations
4571			template
4572	0		class morpho_prefix_guesser {
4573			public:
4574	0		morpho_prefix_guesser(const MorphoDictionary& dictionary) : dictionary(dictionary) {}
4575
4576			void load(binary_decoder& data);
4577			void analyze(string_piece form, vector& lemmas);
4578			bool generate(string_piece lemma, const tag_filter& filter, vector& lemmas_forms);
4579
4580			private:
4581			const MorphoDictionary& dictionary;
4582			vector tag_filters;
4583			persistent_unordered_map prefixes_initial, prefixes_middle;
4584			};
4585
4586			// Definitions
4587			template
4588	0		void morpho_prefix_guesser::load(binary_decoder& data) {
4589			// Load and construct tag filters
4590	0	0	for (unsigned tag_filters_len = data.next_1B(); tag_filters_len; tag_filters_len--) {
4591	0		unsigned tag_filter_len = data.next_1B();
4592	0		string tag_filter(data.next(tag_filter_len), tag_filter_len);
4593
4594	0	0	tag_filters.emplace_back(tag_filter.c_str());
4595			}
4596
4597			// Load prefixes
4598	0		prefixes_initial.load(data);
4599	0		prefixes_middle.load(data);
4600	0		}
4601
4602			// Analyze can return non-unique lemma-tag pairs.
4603			template
4604	0		void morpho_prefix_guesser::analyze(string_piece form, vector& lemmas) {
4605	0	0	if (!form.len) return;
4606
4607			vector form_tmp;
4608			vector middle_masks;
4609	0	0	middle_masks.reserve(form.len);
4610
4611	0	0	for (unsigned initial = 0; initial < form.len; initial++) {
4612			// Match the initial prefix.
4613	0		unsigned initial_mask = (1<
4614	0	0	if (initial) {
4615	0		auto found = prefixes_initial.at_typed(form.str, initial);
4616	0	0	if (!found) break;
4617	0		initial_mask = unaligned_load(found);
4618			}
4619
4620			// If we have found an initial prefix (including the empty one), match middle prefixes.
4621	0	0	if (initial_mask) {
4622	0	0	middle_masks.resize(initial);
4623	0	0	middle_masks.emplace_back(initial_mask);
4624	0	0	for (unsigned middle = initial; middle < middle_masks.size(); middle++) {
4625	0	0	if (!middle_masks[middle]) continue;
4626			// Try matching middle prefixes from current index.
4627	0	0	for (unsigned i = middle + 1; i < form.len; i++) {
4628	0		auto found = prefixes_middle.at_typed(form.str + middle, i - middle);
4629	0	0	if (!found) break;
4630	0	0	if (unaligned_load(found)) {
4631	0	0	if (i + 1 > middle_masks.size()) middle_masks.resize(i + 1);
		0
4632	0		middle_masks[i] \|= middle_masks[middle] & unaligned_load(found);
4633			}
4634			}
4635
4636			// Try matching word forms if at least one middle prefix was found.
4637	0	0	if (middle > initial && middle < form.len ) {
		0
4638	0	0	if (initial) {
4639	0	0	if (form_tmp.empty()) form_tmp.assign(form.str, form.str + form.len);
4640	0		small_memcpy(form_tmp.data() + middle - initial, form.str, initial);
4641			}
4642	0		unsigned lemmas_ori_size = lemmas.size();
4643	0	0	dictionary.analyze(string_piece((initial ? form_tmp.data() : form.str) + middle - initial, form.len - middle + initial), lemmas);
		0
4644			unsigned lemmas_new_size = lemmas_ori_size;
4645	0	0	for (unsigned i = lemmas_ori_size; i < lemmas.size(); i++) {
4646	0	0	for (unsigned filter = 0; filter < tag_filters.size(); filter++)
4647	0	0	if ((middle_masks[middle] & (1<
		0
		0
4648	0	0	if (i == lemmas_new_size) {
4649	0		lemmas[lemmas_new_size].lemma.insert(0, form.str + initial, middle - initial);
4650			} else {
4651	0	0	lemmas[lemmas_new_size].lemma.reserve(lemmas[i].lemma.size() + middle - initial);
4652	0		lemmas[lemmas_new_size].lemma.assign(form.str + initial, middle - initial);
4653	0		lemmas[lemmas_new_size].lemma.append(lemmas[i].lemma);
4654	0		lemmas[lemmas_new_size].tag = lemmas[i].tag;
4655			}
4656	0		lemmas_new_size++;
4657	0		break;
4658			}
4659			}
4660	0	0	if (lemmas_new_size < lemmas.size()) lemmas.erase(lemmas.begin() + lemmas_new_size, lemmas.end());
4661			}
4662			}
4663			}
4664			}
4665			}
4666
4667			template
4668			bool morpho_prefix_guesser::generate(string_piece /lemma/, const tag_filter& /filter/, vector& /lemmas_forms/) {
4669			// Not implemented yet. Is it actually needed?
4670			return false;
4671			}
4672			} // namespace morphodita
4673
4674			/////////
4675			// File: morphodita/morpho/morpho_statistical_guesser.h
4676			/////////
4677
4678			// This file is part of MorphoDiTa .
4679			//
4680			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
4681			// Mathematics and Physics, Charles University in Prague, Czech Republic.
4682			//
4683			// This Source Code Form is subject to the terms of the Mozilla Public
4684			// License, v. 2.0. If a copy of the MPL was not distributed with this
4685			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
4686
4687			namespace morphodita {
4688
4689	1		class morpho_statistical_guesser {
4690			public:
4691			void load(binary_decoder& data);
4692			typedef vector used_rules;
4693			void analyze(string_piece form, vector& lemmas, used_rules* used);
4694
4695			private:
4696			vector tags;
4697			unsigned default_tag;
4698			persistent_unordered_map rules;
4699			};
4700
4701			} // namespace morphodita
4702
4703			/////////
4704			// File: morphodita/tokenizer/unicode_tokenizer.h
4705			/////////
4706
4707			// This file is part of MorphoDiTa .
4708			//
4709			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
4710			// Mathematics and Physics, Charles University in Prague, Czech Republic.
4711			//
4712			// This Source Code Form is subject to the terms of the Mozilla Public
4713			// License, v. 2.0. If a copy of the MPL was not distributed with this
4714			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
4715
4716			namespace morphodita {
4717
4718	1		class unicode_tokenizer : public tokenizer {
4719			public:
4720			enum { URL_EMAIL_LATEST = 2 };
4721			unicode_tokenizer(unsigned url_email_tokenizer);
4722
4723			virtual void set_text(string_piece text, bool make_copy = false) override;
4724			virtual bool next_sentence(vector* forms, vector* tokens) override;
4725
4726			virtual bool next_sentence(vector& tokens) = 0;
4727
4728			protected:
4729			struct char_info {
4730			char32_t chr;
4731			unilib::unicode::category_t cat;
4732			const char* str;
4733
4734	36		char_info(char32_t chr, const char* str) : chr(chr), cat(unilib::unicode::category(chr)), str(str) {}
4735			};
4736			vector chars;
4737			size_t current;
4738
4739			bool tokenize_url_email(vector& tokens);
4740			bool emergency_sentence_split(const vector& tokens);
4741			bool is_eos(const vector& tokens, char32_t eos_chr, const unordered_set* abbreviations);
4742
4743			private:
4744			unsigned url_email_tokenizer;
4745			string text_buffer;
4746			vector tokens_buffer;
4747			string eos_buffer;
4748			};
4749
4750			} // namespace morphodita
4751
4752			/////////
4753			// File: morphodita/tokenizer/ragel_tokenizer.h
4754			/////////
4755
4756			// This file is part of MorphoDiTa .
4757			//
4758			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
4759			// Mathematics and Physics, Charles University in Prague, Czech Republic.
4760			//
4761			// This Source Code Form is subject to the terms of the Mozilla Public
4762			// License, v. 2.0. If a copy of the MPL was not distributed with this
4763			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
4764
4765			namespace morphodita {
4766
4767	0		class ragel_tokenizer : public unicode_tokenizer {
4768			public:
4769			ragel_tokenizer(unsigned url_email_tokenizer);
4770
4771			protected:
4772			static inline uint8_t ragel_char(const char_info& chr);
4773
4774			private:
4775			static void initialize_ragel_map();
4776			static vector ragel_map;
4777			static atomic_flag ragel_map_flag;
4778			static void ragel_map_add(char32_t chr, uint8_t mapping);
4779
4780			friend class unicode_tokenizer;
4781			static bool ragel_url_email(unsigned version, const vector& chars, size_t& current_char, vector& tokens);
4782			};
4783
4784			uint8_t ragel_tokenizer::ragel_char(const char_info& chr) {
4785	30	50	return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27);
		100
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
4786			}
4787
4788			} // namespace morphodita
4789
4790			/////////
4791			// File: morphodita/tokenizer/czech_tokenizer.h
4792			/////////
4793
4794			// This file is part of MorphoDiTa .
4795			//
4796			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
4797			// Mathematics and Physics, Charles University in Prague, Czech Republic.
4798			//
4799			// This Source Code Form is subject to the terms of the Mozilla Public
4800			// License, v. 2.0. If a copy of the MPL was not distributed with this
4801			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
4802
4803			namespace morphodita {
4804
4805	0		class czech_tokenizer : public ragel_tokenizer {
4806			public:
4807			enum tokenizer_language { CZECH = 0, SLOVAK = 1 };
4808			enum { LATEST = 2 };
4809			czech_tokenizer(tokenizer_language language, unsigned version, const morpho* m = nullptr);
4810
4811			virtual bool next_sentence(vector& tokens) override;
4812
4813			private:
4814			const morpho* m;
4815			const unordered_set* abbreviations;
4816			vector lemmas;
4817
4818			void merge_hyphenated(vector& tokens);
4819
4820			static const unordered_set abbreviations_czech;
4821			static const unordered_set abbreviations_slovak;
4822			};
4823
4824			} // namespace morphodita
4825
4826			/////////
4827			// File: morphodita/morpho/czech_morpho.h
4828			/////////
4829
4830			// This file is part of MorphoDiTa .
4831			//
4832			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
4833			// Mathematics and Physics, Charles University in Prague, Czech Republic.
4834			//
4835			// This Source Code Form is subject to the terms of the Mozilla Public
4836			// License, v. 2.0. If a copy of the MPL was not distributed with this
4837			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
4838
4839			namespace morphodita {
4840
4841	0		class czech_morpho : public morpho {
4842			public:
4843			using morpho_language = czech_tokenizer::tokenizer_language;
4844
4845	0	0	czech_morpho(morpho_language language, unsigned version) : language(language), version(version) {}
		0
		0
4846
4847			virtual int analyze(string_piece form, morpho::guesser_mode guesser, vector& lemmas) const override;
4848			virtual int generate(string_piece lemma, const char* tag_wildcard, guesser_mode guesser, vector& forms) const override;
4849			virtual int raw_lemma_len(string_piece lemma) const override;
4850			virtual int lemma_id_len(string_piece lemma) const override;
4851			virtual int raw_form_len(string_piece form) const override;
4852			virtual tokenizer* new_tokenizer() const override;
4853
4854			bool load(istream& is);
4855			private:
4856			inline void analyze_special(string_piece form, vector& lemmas) const;
4857
4858			morpho_language language;
4859			unsigned version;
4860			morpho_dictionary dictionary;
4861			unique_ptr> prefix_guesser;
4862			unique_ptr statistical_guesser;
4863
4864			string unknown_tag = "X@-------------";
4865			string number_tag = "C=-------------";
4866			string punctuation_tag = "Z:-------------";
4867			};
4868
4869			} // namespace morphodita
4870
4871			/////////
4872			// File: morphodita/morpho/czech_morpho.cpp
4873			/////////
4874
4875			// This file is part of MorphoDiTa .
4876			//
4877			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
4878			// Mathematics and Physics, Charles University in Prague, Czech Republic.
4879			//
4880			// This Source Code Form is subject to the terms of the Mozilla Public
4881			// License, v. 2.0. If a copy of the MPL was not distributed with this
4882			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
4883
4884			namespace morphodita {
4885
4886	0		bool czech_morpho::load(istream& is) {
4887			binary_decoder data;
4888	0	0	if (!compressor::load(is, data)) return false;
		0
4889
4890			try {
4891			// Load tag length
4892	0	0	unsigned tag_length = data.next_1B();
4893	0	0	if (tag_length < unknown_tag.size()) unknown_tag.erase(tag_length);
		0
4894	0	0	if (tag_length < number_tag.size()) number_tag.erase(tag_length);
		0
4895	0	0	if (tag_length < punctuation_tag.size()) punctuation_tag.erase(tag_length);
		0
4896
4897			// Load dictionary
4898	0	0	dictionary.load(data);
4899
4900			// Optionally prefix guesser if present
4901	0		prefix_guesser.reset();
4902	0	0	if (data.next_1B()) {
		0
4903	0	0	prefix_guesser.reset(new morpho_prefix_guesser(dictionary));
4904	0	0	prefix_guesser->load(data);
4905			}
4906
4907			// Optionally statistical guesser if present
4908			statistical_guesser.reset();
4909	0	0	if (data.next_1B()) {
		0
4910	0	0	statistical_guesser.reset(new morpho_statistical_guesser());
4911	0	0	statistical_guesser->load(data);
4912		0	}
4913			} catch (binary_decoder_error&) {
4914			return false;
4915			}
4916
4917	0		return data.is_end();
4918			}
4919
4920	0		int czech_morpho::analyze(string_piece form, guesser_mode guesser, vector& lemmas) const {
4921			lemmas.clear();
4922
4923	0	0	if (form.len) {
4924			// Generate all casing variants if needed (they are different than given form).
4925			string form_uclc; // first uppercase, rest lowercase
4926			string form_lc; // all lowercase
4927	0	0	generate_casing_variants(form, form_uclc, form_lc);
4928
4929			// Start by analysing using the dictionary and all casing variants.
4930	0	0	dictionary.analyze(form, lemmas);
4931	0	0	if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas);
		0
4932	0	0	if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas);
		0
4933	0	0	if (!lemmas.empty()) return NO_GUESSER;
4934
4935			// Then call analyze_special to handle numbers and punctuation.
4936	0	0	analyze_special(form, lemmas);
4937	0	0	if (!lemmas.empty()) return NO_GUESSER;
4938
4939			// For the prefix guesser, use only form_lc.
4940	0	0	if (guesser == GUESSER && prefix_guesser)
		0
		0
4941	0	0	prefix_guesser->analyze(form_lc.empty() ? form : form_lc, lemmas);
		0
4942			bool prefix_guesser_guesses = !lemmas.empty();
4943
4944			// For the statistical guesser, use all casing variants.
4945	0	0	if (guesser == GUESSER && statistical_guesser) {
		0
		0
4946	0	0	if (form_uclc.empty() && form_lc.empty())
		0
		0
4947	0	0	statistical_guesser->analyze(form, lemmas, nullptr);
4948			else {
4949	0	0	morpho_statistical_guesser::used_rules used_rules; used_rules.reserve(3);
4950	0	0	statistical_guesser->analyze(form, lemmas, &used_rules);
4951	0	0	if (!form_uclc.empty()) statistical_guesser->analyze(form_uclc, lemmas, &used_rules);
		0
4952	0	0	if (!form_lc.empty()) statistical_guesser->analyze(form_lc, lemmas, &used_rules);
		0
4953			}
4954			}
4955
4956			// Make sure results are unique lemma-tag pairs. Statistical guesser produces
4957			// unique lemma-tag pairs, but prefix guesser does not.
4958	0	0	if (prefix_guesser_guesses) {
4959	0		sort(lemmas.begin(), lemmas.end(), [](const tagged_lemma& a, const tagged_lemma& b) {
4960	0		int lemma_compare = a.lemma.compare(b.lemma);
4961	0	0	return lemma_compare < 0 \|\| (lemma_compare == 0 && a.tag < b.tag);
4962			});
4963	0		auto lemmas_end = unique(lemmas.begin(), lemmas.end(), [](const tagged_lemma& a, const tagged_lemma& b) {
4964	0	0	return a.lemma == b.lemma && a.tag == b.tag;
		0
4965	0		});
4966	0	0	if (lemmas_end != lemmas.end()) lemmas.erase(lemmas_end, lemmas.end());
4967			}
4968
4969	0	0	if (!lemmas.empty()) return GUESSER;
4970			}
4971
4972	0	0	lemmas.emplace_back(string(form.str, form.len), unknown_tag);
4973	0		return -1;
4974			}
4975
4976	0		int czech_morpho::generate(string_piece lemma, const char* tag_wildcard, morpho::guesser_mode guesser, vector& forms) const {
4977			forms.clear();
4978
4979	0		tag_filter filter(tag_wildcard);
4980
4981	0	0	if (lemma.len) {
4982	0	0	if (dictionary.generate(lemma, filter, forms))
		0
4983			return NO_GUESSER;
4984
4985	0	0	if (guesser == GUESSER && prefix_guesser)
		0
4986			if (prefix_guesser->generate(lemma, filter, forms))
4987			return GUESSER;
4988			}
4989
4990			return -1;
4991			}
4992
4993	0		int czech_morpho::raw_lemma_len(string_piece lemma) const {
4994	0		return czech_lemma_addinfo::raw_lemma_len(lemma);
4995			}
4996
4997	0		int czech_morpho::lemma_id_len(string_piece lemma) const {
4998	0		return czech_lemma_addinfo::lemma_id_len(lemma);
4999			}
5000
5001	0		int czech_morpho::raw_form_len(string_piece form) const {
5002	0		return form.len;
5003			}
5004
5005	0		tokenizer* czech_morpho::new_tokenizer() const {
5006	0	0	return new czech_tokenizer(language, version, this);
5007			}
5008
5009			// What characters are considered punctuation except for the ones in unicode Punctuation category.
5010			static bool punctuation_additional[] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1/$/,
5011			0,0,0,0,0,0,1/+/,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1/</,1/=/,1/>/,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
5012			0,0,0,0,0,0,0,0,1/^/,0,1/`/,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1/\|/,0,1/~/,0,0,0,0,0,0,0,0,
5013			0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
5014			0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
5015			0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
5016			0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
5017			0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
5018			0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
5019			0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
5020			0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
5021			0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
5022			0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1/caron/};
5023
5024			// What characters of unicode Punctuation category are not considered punctuation.
5025			static bool punctuation_exceptions[] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
5026			0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
5027			0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
5028			0,0,0,0,0,0,0,0,0,1/paragraph/};
5029
5030	0		void czech_morpho::analyze_special(string_piece form, vector& lemmas) const {
5031			using namespace unilib;
5032
5033			// Analyzer for numbers and punctuation.
5034			// Number is anything matching [+-]? is_Pn* ([.,] is_Pn)? ([Ee] [+-]? is_Pn+)? for at least one is_Pn nonempty.
5035			// Punctuation is any form beginning with either unicode punctuation or punctuation_exceptions character.
5036			// Beware that numbers takes precedence, so - is punctuation, -3 is number, -. is punctuation, -.3 is number.
5037	0	0	if (!form.len) return;
5038
5039	0		string_piece form_ori = form;
5040	0		char32_t first = utf8::decode(form.str, form.len);
5041
5042			// Try matching a number.
5043			char32_t codepoint = first;
5044			bool any_digit = false;
5045	0	0	if (codepoint == '+' \|\| codepoint == '-') codepoint = utf8::decode(form.str, form.len);
5046	0	0	while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(form.str, form.len);
5047	0	0	if ((codepoint == '.' && form.len) \|\| codepoint == ',') codepoint = utf8::decode(form.str, form.len);
		0
		0
5048	0	0	while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(form.str, form.len);
5049	0	0	if (any_digit && (codepoint == 'e' \|\| codepoint == 'E')) {
		0
5050	0		codepoint = utf8::decode(form.str, form.len);
5051	0	0	if (codepoint == '+' \|\| codepoint == '-') codepoint = utf8::decode(form.str, form.len);
5052			any_digit = false;
5053	0	0	while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(form.str, form.len);
5054			}
5055
5056	0	0	if (any_digit && !form.len && (!codepoint \|\| codepoint == '.')) {
		0
		0
5057	0	0	lemmas.emplace_back(string(form_ori.str, form_ori.len), number_tag);
5058	0	0	} else if ((first < sizeof(punctuation_additional) && punctuation_additional[first]) \|\|
		0
		0
		0
5059	0	0	((unicode::category(first) & unicode::P) && (first >= sizeof(punctuation_exceptions) \|\| !punctuation_exceptions[first])))
		0
5060	0	0	lemmas.emplace_back(string(form_ori.str, form_ori.len), punctuation_tag);
5061			}
5062
5063			} // namespace morphodita
5064
5065			/////////
5066			// File: morphodita/morpho/english_lemma_addinfo.h
5067			/////////
5068
5069			// This file is part of MorphoDiTa .
5070			//
5071			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
5072			// Mathematics and Physics, Charles University in Prague, Czech Republic.
5073			//
5074			// This Source Code Form is subject to the terms of the Mozilla Public
5075			// License, v. 2.0. If a copy of the MPL was not distributed with this
5076			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
5077
5078			namespace morphodita {
5079
5080			// Declarations
5081	0		struct english_lemma_addinfo {
5082			inline static int raw_lemma_len(string_piece lemma);
5083			inline static int lemma_id_len(string_piece lemma);
5084			inline static string format(const unsigned char* addinfo, int addinfo_len);
5085			inline static bool generatable(const unsigned char* addinfo, int addinfo_len);
5086
5087			inline int parse(string_piece lemma, bool die_on_failure = false);
5088			inline bool match_lemma_id(const unsigned char* other_addinfo, int other_addinfo_len);
5089
5090			vector data;
5091			};
5092
5093			// Definitions
5094	0		int english_lemma_addinfo::raw_lemma_len(string_piece lemma) {
5095			// Lemma ends either by
5096			// - '^' on non-first position followed by nothing or [A-Za-z][-A-Za-z]*
5097			// - '+' on non-first position followed by nothing
5098	0	0	for (unsigned len = 1; len < lemma.len; len++) {
5099	0	0	if (len + 1 == lemma.len && (lemma.str[len] == '^' \|\| lemma.str[len] == '+'))
		0
5100	0		return len;
5101	0	0	if (len + 1 < lemma.len && lemma.str[len] == '^') {
		0
5102			bool ok = true;
5103	0	0	for (unsigned i = len + 1; ok && i < lemma.len; i++)
		0
5104	0	0	ok &= (lemma.str[i] >= 'A' && lemma.str[i] <= 'Z') \|\|
5105	0	0	(lemma.str[i] >= 'a' && lemma.str[i] <= 'z') \|\|
		0
5106	0	0	(i > len + 1 && lemma.str[i] == '-');
5107	0	0	if (ok) return len;
5108			}
5109			}
5110	0		return lemma.len;
5111			}
5112
5113			int english_lemma_addinfo::lemma_id_len(string_piece lemma) {
5114			// No lemma comments.
5115	0		return lemma.len;
5116			}
5117
5118			string english_lemma_addinfo::format(const unsigned char* addinfo, int addinfo_len) {
5119	0		return string((const char*) addinfo, addinfo_len);
5120			}
5121
5122			bool english_lemma_addinfo::generatable(const unsigned char* /addinfo/, int /addinfo_len/) {
5123			return true;
5124			}
5125
5126	0		int english_lemma_addinfo::parse(string_piece lemma, bool /die_on_failure/) {
5127			data.clear();
5128
5129	0		size_t len = raw_lemma_len(lemma);
5130	0	0	for (size_t i = len; i < lemma.len; i++)
5131	0		data.push_back(lemma.str[i]);
5132
5133	0		return len;
5134			}
5135
5136	0		bool english_lemma_addinfo::match_lemma_id(const unsigned char* other_addinfo, int other_addinfo_len) {
5137	0	0	if (data.empty()) return true;
5138	0	0	if (data.size() == 1 && data[0] == '^') return other_addinfo_len > 0 && other_addinfo[0] == '^';
		0
		0
		0
		0
5139	0	0	if (data.size() == 1 && data[0] == '+') return other_addinfo_len == 0;
		0
		0
5140	0	0	return data.size() == size_t(other_addinfo_len) && small_memeq(data.data(), other_addinfo, other_addinfo_len);
		0
5141			}
5142
5143			} // namespace morphodita
5144
5145			/////////
5146			// File: morphodita/morpho/english_morpho_guesser.h
5147			/////////
5148
5149			// This file is part of MorphoDiTa .
5150			//
5151			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
5152			// Mathematics and Physics, Charles University in Prague, Czech Republic.
5153			//
5154			// This Source Code Form is subject to the terms of the Mozilla Public
5155			// License, v. 2.0. If a copy of the MPL was not distributed with this
5156			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
5157
5158			namespace morphodita {
5159
5160	0	0	class english_morpho_guesser {
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
5161			public:
5162			void load(binary_decoder& data);
5163			void analyze(string_piece form, string_piece form_lc, vector& lemmas) const;
5164			bool analyze_proper_names(string_piece form, string_piece form_lc, vector& lemmas) const;
5165
5166			private:
5167			inline void add(const string& tag, const string& form, vector& lemmas) const;
5168			inline void add(const string& tag, const string& tag2, const string& form, vector& lemmas) const;
5169			inline void add(const string& tag, const string& form, unsigned negation_len, vector& lemmas) const;
5170			inline void add(const string& tag, const string& tag2, const string& form, unsigned negation_len, vector& lemmas) const;
5171			void add_NNS(const string& form, unsigned negation_len, vector& lemmas) const;
5172			void add_NNPS(const string& form, vector& lemmas) const;
5173			void add_VBG(const string& form, vector& lemmas) const;
5174			void add_VBD_VBN(const string& form, vector& lemmas) const;
5175			void add_VBZ(const string& form, vector& lemmas) const;
5176			void add_JJR_RBR(const string& form, unsigned negation_len, vector& lemmas) const;
5177			void add_JJS_RBS(const string& form, unsigned negation_len, vector& lemmas) const;
5178
5179			enum { NEGATION_LEN = 0, TO_FOLLOW = 1, TOTAL = 2 };
5180			vector exceptions_tags;
5181			persistent_unordered_map exceptions;
5182			persistent_unordered_map negations;
5183			string CD = "CD", FW = "FW", JJ = "JJ", JJR = "JJR", JJS = "JJS",
5184			NN = "NN", NNP = "NNP", NNPS = "NNPS", NNS = "NNS", RB = "RB",
5185			RBR = "RBR", RBS = "RBS", SYM = "SYM", VB = "VB", VBD = "VBD",
5186			VBG = "VBG", VBN = "VBN", VBP = "VBP", VBZ = "VBZ";
5187			};
5188
5189			} // namespace morphodita
5190
5191			/////////
5192			// File: morphodita/morpho/english_morpho.h
5193			/////////
5194
5195			// This file is part of MorphoDiTa .
5196			//
5197			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
5198			// Mathematics and Physics, Charles University in Prague, Czech Republic.
5199			//
5200			// This Source Code Form is subject to the terms of the Mozilla Public
5201			// License, v. 2.0. If a copy of the MPL was not distributed with this
5202			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
5203
5204			namespace morphodita {
5205
5206	0		class english_morpho : public morpho {
5207			public:
5208	0	0	english_morpho(unsigned version) : version(version) {}
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
5209
5210			virtual int analyze(string_piece form, morpho::guesser_mode guesser, vector& lemmas) const override;
5211			virtual int generate(string_piece lemma, const char* tag_wildcard, guesser_mode guesser, vector& forms) const override;
5212			virtual int raw_lemma_len(string_piece lemma) const override;
5213			virtual int lemma_id_len(string_piece lemma) const override;
5214			virtual int raw_form_len(string_piece form) const override;
5215			virtual tokenizer* new_tokenizer() const override;
5216
5217			bool load(istream& is);
5218			private:
5219			inline void analyze_special(string_piece form, vector& lemmas) const;
5220
5221			unsigned version;
5222			morpho_dictionary dictionary;
5223			english_morpho_guesser morpho_guesser;
5224
5225			string unknown_tag = "UNK";
5226			string number_tag = "CD", nnp_tag = "NNP", ls_tag = "LS";
5227			string open_quotation_tag = "``", close_quotation_tag = "''";
5228			string open_parenthesis_tag = "(", close_parenthesis_tag = ")";
5229			string comma_tag = ",", dot_tag = ".", punctuation_tag = ":", hash_tag = "#", dollar_tag = "$";
5230			string sym_tag = "SYM", jj_tag = "JJ", nn_tag = "NN", nns_tag = "NNS", cc_tag = "CC", pos_tag = "POS", in_tag = "IN";
5231			};
5232
5233			} // namespace morphodita
5234
5235			/////////
5236			// File: morphodita/tokenizer/english_tokenizer.h
5237			/////////
5238
5239			// This file is part of MorphoDiTa .
5240			//
5241			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
5242			// Mathematics and Physics, Charles University in Prague, Czech Republic.
5243			//
5244			// This Source Code Form is subject to the terms of the Mozilla Public
5245			// License, v. 2.0. If a copy of the MPL was not distributed with this
5246			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
5247
5248			namespace morphodita {
5249
5250	0		class english_tokenizer : public ragel_tokenizer {
5251			public:
5252			enum { LATEST = 2 };
5253			english_tokenizer(unsigned version);
5254
5255			virtual bool next_sentence(vector& tokens) override;
5256
5257			private:
5258			void split_token(vector& tokens);
5259
5260			static const unordered_set abbreviations;
5261			};
5262
5263			} // namespace morphodita
5264
5265			/////////
5266			// File: morphodita/morpho/english_morpho.cpp
5267			/////////
5268
5269			// This file is part of MorphoDiTa .
5270			//
5271			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
5272			// Mathematics and Physics, Charles University in Prague, Czech Republic.
5273			//
5274			// This Source Code Form is subject to the terms of the Mozilla Public
5275			// License, v. 2.0. If a copy of the MPL was not distributed with this
5276			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
5277
5278			namespace morphodita {
5279
5280	0		bool english_morpho::load(istream& is) {
5281			binary_decoder data;
5282	0	0	if (!compressor::load(is, data)) return false;
		0
5283
5284			try {
5285	0	0	dictionary.load(data);
5286	0	0	morpho_guesser.load(data);
		0
5287			} catch (binary_decoder_error&) {
5288			return false;
5289			}
5290
5291	0		return data.is_end();
5292			}
5293
5294	0		int english_morpho::analyze(string_piece form, guesser_mode guesser, vector& lemmas) const {
5295			lemmas.clear();
5296
5297	0	0	if (form.len) {
5298			// Generate all casing variants if needed (they are different than given form).
5299			string form_uclc; // first uppercase, rest lowercase
5300			string form_lc; // all lowercase
5301	0	0	generate_casing_variants(form, form_uclc, form_lc);
5302
5303			// Start by analysing using the dictionary and all casing variants.
5304	0	0	dictionary.analyze(form, lemmas);
5305	0	0	if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas);
		0
5306	0	0	if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas);
		0
5307	0	0	if (!lemmas.empty())
5308	0	0	return guesser == NO_GUESSER \|\| !morpho_guesser.analyze_proper_names(form, form_lc.empty() ? form : form_lc, lemmas) ? NO_GUESSER : GUESSER;
		0
		0
		0
5309
5310			// Then call analyze_special to handle numbers, punctuation and symbols.
5311	0	0	analyze_special(form, lemmas);
5312	0	0	if (!lemmas.empty()) return NO_GUESSER;
5313
5314			// Use English guesser on form_lc if allowed.
5315	0	0	if (guesser == GUESSER)
5316	0	0	morpho_guesser.analyze(form, form_lc.empty() ? form : form_lc, lemmas);
		0
5317	0	0	if (!lemmas.empty()) return GUESSER;
5318			}
5319
5320	0	0	lemmas.emplace_back(string(form.str, form.len), unknown_tag);
5321	0		return -1;
5322			}
5323
5324	0		int english_morpho::generate(string_piece lemma, const char* tag_wildcard, morpho::guesser_mode /guesser/, vector& forms) const {
5325			forms.clear();
5326
5327	0		tag_filter filter(tag_wildcard);
5328
5329	0	0	if (lemma.len) {
5330	0	0	if (dictionary.generate(lemma, filter, forms))
		0
5331			return NO_GUESSER;
5332			}
5333
5334			return -1;
5335			}
5336
5337	0		int english_morpho::raw_lemma_len(string_piece lemma) const {
5338	0		return english_lemma_addinfo::raw_lemma_len(lemma);
5339			}
5340
5341	0		int english_morpho::lemma_id_len(string_piece lemma) const {
5342	0		return english_lemma_addinfo::lemma_id_len(lemma);
5343			}
5344
5345	0		int english_morpho::raw_form_len(string_piece form) const {
5346	0		return form.len;
5347			}
5348
5349	0		tokenizer* english_morpho::new_tokenizer() const {
5350	0	0	return new english_tokenizer(version <= 2 ? 1 : 2);
5351			}
5352
5353	0		void english_morpho::analyze_special(string_piece form, vector& lemmas) const {
5354			using namespace unilib;
5355
5356			// Analyzer for numbers and punctuation.
5357	0	0	if (!form.len) return;
5358
5359			// One-letter punctuation exceptions.
5360	0	0	if (form.len == 1)
5361	0		switch(*form.str) {
5362			case '.':
5363			case '!':
5364	0	0	case '?': lemmas.emplace_back(string(form.str, form.len), dot_tag); return;
5365	0	0	case ',': lemmas.emplace_back(string(form.str, form.len), comma_tag); return;
5366	0	0	case '#': lemmas.emplace_back(string(form.str, form.len), hash_tag); return;
5367	0	0	case '$': lemmas.emplace_back(string(form.str, form.len), dollar_tag); return;
5368	0	0	case '[': lemmas.emplace_back(string(form.str, form.len), sym_tag); return;
5369	0	0	case ']': lemmas.emplace_back(string(form.str, form.len), sym_tag); return;
5370	0	0	case '%': lemmas.emplace_back(string(form.str, form.len), jj_tag);
5371	0	0	lemmas.emplace_back(string(form.str, form.len), nn_tag); return;
5372	0	0	case '&': lemmas.emplace_back(string(form.str, form.len), cc_tag);
5373	0	0	lemmas.emplace_back(string(form.str, form.len), sym_tag); return;
5374	0	0	case '*': lemmas.emplace_back(string(form.str, form.len), sym_tag);
5375	0	0	lemmas.emplace_back(string(form.str, form.len), nn_tag); return;
5376	0	0	case '@': lemmas.emplace_back(string(form.str, form.len), sym_tag);
5377	0	0	lemmas.emplace_back(string(form.str, form.len), in_tag); return;
5378	0	0	case '\'': lemmas.emplace_back(string(form.str, form.len), close_quotation_tag);
5379	0	0	lemmas.emplace_back(string(form.str, form.len), pos_tag); return;
5380			}
5381
5382			// Try matching a number: [+-]? is_Pn* (, is_Pn{3})? (. is_Pn*)? (s \| [Ee] [+-]? is_Pn+)? with at least one digit
5383	0		string_piece number = form;
5384	0		char32_t codepoint = utf8::decode(number.str, number.len);
5385			bool any_digit = false;
5386	0	0	if (codepoint == '+' \|\| codepoint == '-') codepoint = utf8::decode(number.str, number.len);
5387	0	0	while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len);
5388	0	0	while (codepoint == ',') {
5389	0		string_piece group = number;
5390	0	0	if (unicode::category(utf8::decode(group.str, group.len) & ~unicode::N)) break;
5391	0	0	if (unicode::category(utf8::decode(group.str, group.len) & ~unicode::N)) break;
5392	0	0	if (unicode::category(utf8::decode(group.str, group.len) & ~unicode::N)) break;
5393			any_digit = true;
5394	0		number = group;
5395	0		codepoint = utf8::decode(number.str, number.len);
5396			}
5397	0	0	if (codepoint == '.' && number.len) {
		0
5398	0		codepoint = utf8::decode(number.str, number.len);
5399	0	0	while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len);
5400			}
5401	0	0	if (version >= 2 && any_digit && codepoint == 's' && !number.len) {
		0
		0
5402	0	0	lemmas.emplace_back(string(form.str, form.len), number_tag);
5403	0	0	lemmas.emplace_back(string(form.str, form.len - 1), nns_tag);
5404	0		return;
5405			}
5406	0	0	if (any_digit && (codepoint == 'e' \|\| codepoint == 'E')) {
		0
5407	0		codepoint = utf8::decode(number.str, number.len);
5408	0	0	if (codepoint == '+' \|\| codepoint == '-') codepoint = utf8::decode(number.str, number.len);
5409			any_digit = false;
5410	0	0	while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len);
5411			}
5412	0	0	if (any_digit && !number.len && (!codepoint \|\| codepoint == '.')) {
		0
		0
5413	0	0	lemmas.emplace_back(string(form.str, form.len), number_tag);
5414	0	0	lemmas.emplace_back(string(form.str, form.len), nnp_tag);
5415	0	0	if (form.len == 1 + (codepoint == '.') && form.str >= '1' && form.str <= '9')
		0
		0
		0
5416	0	0	lemmas.emplace_back(string(form.str, form.len), ls_tag);
5417			return;
5418			}
5419
5420			// Open quotation, end quotation, open parentheses, end parentheses, symbol, or other
5421	0		string_piece punctuation = form;
5422			bool open_quotation = true, close_quotation = true, open_parenthesis = true, close_parenthesis = true, any_punctuation = true, symbol = true;
5423	0	0	while ((symbol \|\| any_punctuation) && punctuation.len) {
		0
5424	0		codepoint = utf8::decode(punctuation.str, punctuation.len);
5425	0	0	if (open_quotation) open_quotation = codepoint == '`' \|\| unicode::category(codepoint) & unicode::Pi;
		0
		0
5426	0	0	if (close_quotation) close_quotation = codepoint == '\'' \|\| codepoint == '"' \|\| unicode::category(codepoint) & unicode::Pf;
		0
		0
5427	0	0	if (open_parenthesis) open_parenthesis = unicode::category(codepoint) & unicode::Ps;
5428	0	0	if (close_parenthesis) close_parenthesis = unicode::category(codepoint) & unicode::Pe;
5429	0	0	if (any_punctuation) any_punctuation = unicode::category(codepoint) & unicode::P;
5430	0	0	if (symbol) symbol = codepoint == '*' \|\| unicode::category(codepoint) & unicode::S;
		0
		0
5431			}
5432	0	0	if (!punctuation.len && open_quotation) { lemmas.emplace_back(string(form.str, form.len), open_quotation_tag); return; }
		0
		0
5433	0	0	if (!punctuation.len && close_quotation) { lemmas.emplace_back(string(form.str, form.len), close_quotation_tag); return; }
		0
		0
5434	0	0	if (!punctuation.len && open_parenthesis) { lemmas.emplace_back(string(form.str, form.len), open_parenthesis_tag); return; }
		0
		0
5435	0	0	if (!punctuation.len && close_parenthesis) { lemmas.emplace_back(string(form.str, form.len), close_parenthesis_tag); return; }
		0
		0
5436	0	0	if (!punctuation.len && symbol) { lemmas.emplace_back(string(form.str, form.len), sym_tag); return; }
		0
		0
5437	0	0	if (!punctuation.len && any_punctuation) { lemmas.emplace_back(string(form.str, form.len), punctuation_tag); return; }
		0
		0
5438			}
5439
5440			} // namespace morphodita
5441
5442			/////////
5443			// File: morphodita/morpho/english_morpho_guesser.cpp
5444			/////////
5445
5446			// This file is part of MorphoDiTa .
5447			//
5448			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
5449			// Mathematics and Physics, Charles University in Prague, Czech Republic.
5450			//
5451			// This Source Code Form is subject to the terms of the Mozilla Public
5452			// License, v. 2.0. If a copy of the MPL was not distributed with this
5453			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
5454
5455			// This code is a reimplementation of morphologic analyzer Morphium
5456			// by Johanka Spoustova (Treex::Tool::EnglishMorpho::Analysis Perl module)
5457			// and reimplementation of morphologic lemmatizer by Martin Popel
5458			// (Treex::Tool::EnglishMorpho::Lemmatizer Perl module). The latter is based
5459			// on morpha:
5460			// Minnen, G., J. Carroll and D. Pearce (2001). Applied morphological
5461			// processing of English, Natural Language Engineering, 7(3). 207-223.
5462			// Morpha has been released under LGPL as a part of RASP system
5463			// http://ilexir.co.uk/applications/rasp/.
5464
5465			namespace morphodita {
5466
5467	0		void english_morpho_guesser::load(binary_decoder& data) {
5468	0		unsigned tags = data.next_2B();
5469	0		exceptions_tags.clear();
5470	0		exceptions_tags.reserve(tags);
5471	0	0	while (tags--) {
5472	0		unsigned len = data.next_1B();
5473	0	0	exceptions_tags.emplace_back(string(data.next(len), len));
5474			}
5475
5476	0		exceptions.load(data);
5477	0		negations.load(data);
5478	0		}
5479
5480			static const char _tag_guesser_actions[] = {
			// Lookup tables of the tag_guesser state machine that english_morpho_guesser::analyze
			// drives. NOTE(review): the layout looks machine-generated -- confirm the generator
			// before editing any number by hand.
			//
			// _tag_guesser_actions: packed action lists. Each list is a count followed by that many
			// action ids; analyze reads the count with `*_acts++` and then switches on each id.
			// Offset 0 holds an empty list (count 0).
5481			0, 1, 0, 1, 1, 1, 2, 1,
5482			3, 1, 4, 1, 5, 1, 6, 1,
5483			7, 2, 2, 6, 2, 2, 7, 2,
5484			4, 6, 2, 4, 7, 2, 5, 6,
5485			2, 5, 7, 2, 6, 7, 3, 2,
5486			6, 7, 3, 4, 6, 7, 3, 5,
5487			6, 7
5488			};
5489
			// Per state: offset of the state's first key inside _tag_guesser_trans_keys.
5490			static const unsigned char _tag_guesser_key_offsets[] = {
5491			0, 19, 26, 34, 42, 50, 58, 66,
5492			74, 82, 90, 100, 108, 116, 124, 132,
5493			145, 153, 161, 168, 179, 195, 212, 220,
5494			228, 236
5495			};
5496
			// Keys (character codes) of all states, concatenated. For each state the single keys
			// come first, followed by the range keys stored as (low, high) pairs; analyze binary
			// searches both parts.
5497			static const char _tag_guesser_trans_keys[] = {
5498			45, 46, 99, 100, 103, 105, 109, 110,
5499			114, 115, 116, 118, 120, 48, 57, 65,
5500			90, 97, 122, 45, 48, 57, 65, 90,
5501			97, 122, 45, 114, 48, 57, 65, 90,
5502			97, 122, 45, 111, 48, 57, 65, 90,
5503			97, 122, 45, 109, 48, 57, 65, 90,
5504			97, 122, 45, 101, 48, 57, 65, 90,
5505			97, 122, 45, 115, 48, 57, 65, 90,
5506			97, 122, 45, 101, 48, 57, 65, 90,
5507			97, 122, 45, 108, 48, 57, 65, 90,
5508			97, 122, 45, 115, 48, 57, 65, 90,
5509			97, 122, 45, 97, 101, 111, 48, 57,
5510			65, 90, 98, 122, 45, 101, 48, 57,
5511			65, 90, 97, 122, 45, 108, 48, 57,
5512			65, 90, 97, 122, 45, 109, 48, 57,
5513			65, 90, 97, 122, 45, 105, 48, 57,
5514			65, 90, 97, 122, 45, 97, 101, 105,
5515			111, 117, 121, 48, 57, 65, 90, 98,
5516			122, 45, 115, 48, 57, 65, 90, 97,
5517			122, 45, 101, 48, 57, 65, 90, 97,
5518			122, 45, 48, 57, 65, 90, 97, 122,
5519			45, 101, 114, 115, 116, 48, 57, 65,
5520			90, 97, 122, 45, 46, 105, 109, 118,
5521			120, 48, 57, 65, 90, 97, 98, 99,
5522			100, 101, 122, 45, 46, 101, 105, 109,
5523			118, 120, 48, 57, 65, 90, 97, 98,
5524			99, 100, 102, 122, 45, 110, 48, 57,
5525			65, 90, 97, 122, 45, 105, 48, 57,
5526			65, 90, 97, 122, 45, 101, 48, 57,
5527			65, 90, 97, 122, 0
5528			};
5529
			// Per state: number of single (exact-match) keys.
5530			static const char _tag_guesser_single_lengths[] = {
5531			13, 1, 2, 2, 2, 2, 2, 2,
5532			2, 2, 4, 2, 2, 2, 2, 7,
5533			2, 2, 1, 5, 6, 7, 2, 2,
5534			2, 2
5535			};
5536
			// Per state: number of (low, high) key ranges.
5537			static const char _tag_guesser_range_lengths[] = {
5538			3, 3, 3, 3, 3, 3, 3, 3,
5539			3, 3, 3, 3, 3, 3, 3, 3,
5540			3, 3, 3, 3, 5, 5, 3, 3,
5541			3, 3
5542			};
5543
			// Per state: index of the state's first entry inside _tag_guesser_indicies. A state owns
			// one entry per single key, one per range, and a final entry used when nothing matches.
5544			static const unsigned char _tag_guesser_index_offsets[] = {
5545			0, 17, 22, 28, 34, 40, 46, 52,
5546			58, 64, 70, 78, 84, 90, 96, 102,
5547			113, 119, 125, 130, 139, 151, 164, 170,
5548			176, 182
5549			};
5550
			// Maps a (state, matched key) slot to a transition id, which then indexes
			// _tag_guesser_trans_targs and _tag_guesser_trans_actions.
5551			static const char _tag_guesser_indicies[] = {
5552			1, 2, 5, 6, 7, 5, 5, 8,
5553			9, 10, 11, 5, 5, 3, 4, 4,
5554			0, 13, 14, 15, 15, 12, 13, 16,
5555			14, 15, 15, 12, 13, 17, 14, 15,
5556			15, 12, 13, 18, 14, 15, 15, 12,
5557			13, 18, 14, 15, 15, 12, 13, 19,
5558			14, 15, 15, 12, 13, 20, 14, 15,
5559			15, 12, 13, 18, 14, 15, 15, 12,
5560			13, 21, 14, 15, 15, 12, 13, 22,
5561			23, 24, 14, 15, 15, 12, 13, 25,
5562			14, 15, 15, 12, 13, 23, 14, 15,
5563			15, 12, 13, 23, 14, 15, 15, 12,
5564			13, 26, 14, 15, 15, 12, 28, 15,
5565			15, 15, 15, 15, 15, 29, 26, 26,
5566			27, 31, 4, 32, 33, 33, 30, 13,
5567			23, 14, 15, 15, 12, 13, 14, 15,
5568			15, 12, 13, 34, 35, 36, 37, 14,
5569			15, 15, 12, 13, 38, 39, 39, 39,
5570			39, 14, 15, 15, 39, 15, 12, 13,
5571			38, 40, 39, 39, 39, 39, 14, 15,
5572			15, 39, 15, 12, 13, 41, 14, 15,
5573			15, 12, 13, 42, 14, 15, 15, 12,
5574			13, 18, 14, 15, 15, 12, 13, 43,
5575			14, 15, 15, 12, 0
5576			};
5577
			// Per transition id: the state entered after taking the transition.
5578			static const char _tag_guesser_trans_targs[] = {
5579			18, 19, 20, 18, 18, 20, 21, 22,
5580			23, 24, 16, 25, 18, 19, 18, 1,
5581			3, 4, 18, 7, 8, 10, 11, 18,
5582			13, 12, 18, 18, 19, 18, 18, 19,
5583			18, 18, 2, 5, 6, 9, 20, 20,
5584			18, 14, 15, 17
5585			};
5586
			// Per transition id: offset of its action list inside _tag_guesser_actions;
			// 0 means the transition runs no action.
5587			static const char _tag_guesser_trans_actions[] = {
5588			29, 46, 29, 32, 11, 11, 11, 11,
5589			11, 11, 0, 11, 13, 35, 15, 0,
5590			0, 0, 1, 0, 0, 0, 0, 3,
5591			0, 0, 5, 17, 38, 20, 23, 42,
5592			26, 9, 0, 0, 0, 0, 13, 0,
5593			7, 0, 0, 0
5594			};
5595
			// Per state: offset of the action list (inside _tag_guesser_actions) that analyze runs
			// when the input ends in that state; 0 selects the empty list.
5596			static const char _tag_guesser_eof_actions[] = {
5597			0, 0, 0, 0, 0, 0, 0, 0,
5598			0, 0, 0, 0, 0, 0, 0, 0,
5599			0, 0, 0, 0, 15, 15, 0, 0,
5600			0, 0
5601			};
5602
			// State in which analyze starts the machine.
5603			static const int tag_guesser_start = 0;
5605
5606	0		void english_morpho_guesser::analyze(string_piece form, string_piece form_lc, vector& lemmas) const {
			// Appends guessed (lemma, tag) analyses of a word to `lemmas`.
			//   form    - the word as given; only used for the proper-name step at the end.
			//   form_lc - the form used for the exceptions lookup and the rule guesser
			//             (presumably the lowercased variant of `form` -- confirm with the caller).
			//   lemmas  - output, appended to; existing content is kept.
			// Flow: if form_lc is in the exceptions list, its stored analyses are emitted.
			// Otherwise a negative prefix is searched for, the default tags are added and the
			// tag_guesser state machine adds specialized tags. In both cases
			// analyze_proper_names() runs last.
			// NOTE(review): this listing shows `vector&` and `data.next(...)` without template
			// arguments; they appear to have been lost when the listing was produced.
5607			// Try exceptions list
5608	0		auto* exception = exceptions.at(form_lc.str, form_lc.len, [](pointer_decoder& data){
			// Callback handed to exceptions.at(): walks one stored value, which is a 1-byte item
			// count followed, per item, by two length-prefixed fields (presumably so the lookup
			// can step over a value -- confirm against the container's at()).
5609	0	0	for (unsigned len = data.next_1B(); len; len--) {
5610			data.next(data.next_1B());
5611			data.next(data.next_1B());
5612			}
5613	0		});
5614
5615	0	0	if (exception) {
5616			// Found in exceptions list
			// Decode the value: a 1-byte lemma count, then per lemma a 1-byte length, the lemma
			// text, a 1-byte tag count and that many 2-byte indices into exceptions_tags.
5617			pointer_decoder data(exception);
5618	0	0	for (unsigned len = data.next_1B(); len; len--) {
5619			unsigned lemma_len = data.next_1B();
5620	0		string lemma(data.next(lemma_len), lemma_len);
5621	0	0	for (unsigned tags = data.next_1B(); tags; tags--)
5622	0	0	lemmas.emplace_back(lemma, exceptions_tags[data.next_2B()]);
5623			}
5624			} else {
5625			// Try stripping negative prefix and use rule guesser
5626			string lemma_lc(form_lc.str, form_lc.len);
5627			// Try finding negative prefix
			// Prefixes of growing length are looked up in `negations`; the scan stops at the first
			// length that is not present. A hit only counts when its NEGATION_LEN field is non-zero
			// and at least TO_FOLLOW characters remain after the prefix; the last such hit wins.
5628			unsigned negation_len = 0;
5629	0	0	for (unsigned prefix = 1; prefix <= form_lc.len; prefix++) {
5630	0		auto found = negations.at(form_lc.str, prefix, [](pointer_decoder& data){ data.next(TOTAL); });
5631	0	0	if (!found) break;
5632	0	0	if (found[NEGATION_LEN]) {
5633	0	0	if (form_lc.len - prefix >= found[TO_FOLLOW]) negation_len = found[NEGATION_LEN];
5634			}
5635			}
5636
5637			// Add default tags
			// FW is added without negation handling; JJ, RB, NN and NNS receive negation_len.
5638	0		add(FW, lemma_lc, lemmas);
5639	0	0	add(JJ, lemma_lc, negation_len, lemmas);
5640	0	0	add(RB, lemma_lc, negation_len, lemmas);
5641	0	0	add(NN, lemma_lc, negation_len, lemmas);
5642	0	0	add_NNS(lemma_lc, negation_len, lemmas);
5643
5644			// Add specialized tags
			// The added_* flags make sure each of these tag groups is emitted at most once even if
			// several transitions request it.
5645			const char* p = form_lc.str; int cs;
5646			bool added_JJR_RBR = false, added_JJS_RBS = false, added_SYM = false, added_CD = false;
5647
5648			{
5649			cs = tag_guesser_start;
5650			}
5651
5652			{
			// Table-driven run of the tag_guesser machine. `p` counts consumed characters from the
			// front, but every character is fetched as form_lc.str[len - 1 - (p - str)], so the
			// machine actually reads the word from its last character towards its first.
5653			int _klen;
5654			unsigned int _trans;
5655			const char *_acts;
5656			unsigned int _nacts;
5657			const char *_keys;
5658
5659	0	0	if ( p == ( (form_lc.str + form_lc.len)) )
5660			goto _test_eof;
5661			_resume:
5662	0		_keys = _tag_guesser_trans_keys + _tag_guesser_key_offsets[cs];
5663	0		_trans = _tag_guesser_index_offsets[cs];
5664
			// Step 1: binary search among the single keys of the current state.
5665	0		_klen = _tag_guesser_single_lengths[cs];
5666	0	0	if ( _klen > 0 ) {
5667			const char *_lower = _keys;
5668			const char *_mid;
5669	0		const char *_upper = _keys + _klen - 1;
5670			while (1) {
5671	0	0	if ( _upper < _lower )
5672			break;
5673
5674	0		_mid = _lower + ((_upper-_lower) >> 1);
5675	0	0	if ( ( form_lc.str[form_lc.len - 1 - (p - form_lc.str)]) < *_mid )
5676	0		_upper = _mid - 1;
5677	0	0	else if ( ( form_lc.str[form_lc.len - 1 - (p - form_lc.str)]) > *_mid )
5678	0		_lower = _mid + 1;
5679			else {
5680	0		_trans += (unsigned int)(_mid - _keys);
5681	0		goto _match;
5682			}
5683			}
			// No single key matched: skip past the single keys and their transition slots.
5684	0		_keys += _klen;
5685	0		_trans += _klen;
5686			}
5687
			// Step 2: binary search among the (low, high) ranges, stepping two keys at a time.
5688	0		_klen = _tag_guesser_range_lengths[cs];
5689	0	0	if ( _klen > 0 ) {
5690			const char *_lower = _keys;
5691			const char *_mid;
5692	0		const char *_upper = _keys + (_klen<<1) - 2;
5693			while (1) {
5694	0	0	if ( _upper < _lower )
5695			break;
5696
5697	0		_mid = _lower + (((_upper-_lower) >> 1) & ~1);
5698	0	0	if ( ( form_lc.str[form_lc.len - 1 - (p - form_lc.str)]) < _mid[0] )
5699	0		_upper = _mid - 2;
5700	0	0	else if ( ( form_lc.str[form_lc.len - 1 - (p - form_lc.str)]) > _mid[1] )
5701	0		_lower = _mid + 2;
5702			else {
5703	0		_trans += (unsigned int)((_mid - _keys)>>1);
5704	0		goto _match;
5705			}
5706			}
			// No range matched either: _trans now points at the state's fallback slot.
5707	0		_trans += _klen;
5708			}
5709
5710			_match:
			// Resolve the slot to a transition id, move to its target state and run its actions.
5711	0		_trans = _tag_guesser_indicies[_trans];
5712	0		cs = _tag_guesser_trans_targs[_trans];
5713
5714	0	0	if ( _tag_guesser_trans_actions[_trans] == 0 )
5715			goto _again;
5716
			// An action list is a count followed by that many action ids.
5717	0		_acts = _tag_guesser_actions + _tag_guesser_trans_actions[_trans];
5718	0		_nacts = (unsigned int) *_acts++;
5719	0	0	while ( _nacts-- > 0 )
5720			{
5721	0		switch ( *_acts++ )
5722			{
			// Action ids: 0 = JJR/RBR, 1 = JJS/RBS, 2 = VBG, 3 = VBD/VBN, 4 = VBZ,
			// 5 = VB and VBP, 6 = SYM, 7 = CD.
5723			case 0:
5724	0	0	{ if (!added_JJR_RBR) added_JJR_RBR = true, add_JJR_RBR(lemma_lc, negation_len, lemmas); }
		0
5725			break;
5726			case 1:
5727	0	0	{ if (!added_JJS_RBS) added_JJS_RBS = true, add_JJS_RBS(lemma_lc, negation_len, lemmas); }
		0
5728			break;
5729			case 2:
5730	0	0	{ add_VBG(lemma_lc, lemmas); }
5731			break;
5732			case 3:
5733	0	0	{ add_VBD_VBN(lemma_lc, lemmas); }
5734			break;
5735			case 4:
5736	0	0	{ add_VBZ(lemma_lc, lemmas); }
5737			break;
5738			case 5:
5739	0		{ add(VB, lemma_lc, lemmas); add(VBP, lemma_lc, lemmas); }
5740			break;
5741			case 6:
5742	0	0	{ if (!added_SYM) added_SYM = true, add(SYM, lemma_lc, lemmas); }
5743			break;
5744			case 7:
5745	0	0	{ if (!added_CD) added_CD = true, add(CD, lemma_lc, lemmas); }
5746			break;
5747			}
5748			}
5749
5750			_again:
5751	0	0	if ( ++p != ( (form_lc.str + form_lc.len)) )
5752			goto _resume;
5753			_test_eof: {}
			// Input exhausted: run the end-of-input actions of the final state. Only action 7 (CD)
			// is handled here; any other id in the list is ignored by this switch.
5754	0	0	if ( p == ( (form_lc.str + form_lc.len)) )
5755			{
5756	0		const char *__acts = _tag_guesser_actions + _tag_guesser_eof_actions[cs];
5757	0		unsigned int __nacts = (unsigned int) *__acts++;
5758	0	0	while ( __nacts-- > 0 ) {
5759	0	0	switch ( *__acts++ ) {
5760			case 7:
5761	0	0	{ if (!added_CD) added_CD = true, add(CD, lemma_lc, lemmas); }
5762			break;
5763			}
5764			}
5765			}
5766
5767			}
5768
5769			}
5770
5771			// Add proper names
			// Runs for both branches above; its boolean result is not used here.
5772	0		analyze_proper_names(form, form_lc, lemmas);
5773	0		}
5774
5775	0		bool english_morpho_guesser::analyze_proper_names(string_piece form, string_piece form_lc, vector& lemmas) const {
			// Adds proper-name analyses (NNP and/or NNPS) for `form` unless `lemmas` already holds
			// an entry with that tag.
			//   form    - the word as given; used as the lemma of any analysis added here.
			//   form_lc - compared with `form` by pointer only (see below).
			//   lemmas  - inspected for existing NNP/NNPS entries and appended to.
			// Returns true when at least one analysis was added, false otherwise.
5776			// NNP if form_lc != form or form.str[0] =~ /[0-9']/, NNPS if form_lc != form
			// "form_lc != form" is decided by comparing the two data pointers, not the text.
			// NOTE(review): this presumably relies on the caller passing the very same buffer when
			// the form needed no lowercasing -- confirm with the caller.
			// The first character is tested through *form.str; comparing the pointer itself with a
			// character constant would not implement the rule stated above. form.len is checked
			// first so that an empty form is never dereferenced.
5777	0	0	bool is_NNP = form.str != form_lc.str \|\| (form.len && (*form.str == '\'' \|\| (*form.str >= '0' && *form.str <= '9')));
		0
		0
		0
5778	0		bool is_NNPS = form.str != form_lc.str;
5779	0	0	if (!is_NNP && !is_NNPS) return false;
5780
			// Find out which of the two tags are already present among the analyses.
5781			bool was_NNP = false, was_NNPS = false;
5782	0	0	for (auto&& lemma : lemmas) {
5783	0		was_NNP \|= lemma.tag == NNP;
5784	0		was_NNPS \|= lemma.tag == NNPS;
5785			}
			// Nothing to do when every wanted tag is already there.
5786	0	0	if (!((is_NNP && !was_NNP) \|\| (is_NNPS && !was_NNPS))) return false;
		0
5787
			// NNP keeps the form unchanged as lemma; add_NNPS derives the lemma from the form.
5788			string lemma(form.str, form.len);
5789	0	0	if (is_NNP && !was_NNP) add(NNP, lemma, lemmas);
5790	0	0	if (is_NNPS && !was_NNPS) add_NNPS(lemma, lemmas);
		0
5791			return true;
5792			}
5793
5794			inline void english_morpho_guesser::add(const string& tag, const string& form, vector& lemmas) const {
			// Appends one analysis to `lemmas`, built from `form` (used unchanged as the lemma)
			// and `tag`. Note the argument order: the tag comes first here, but the entry is
			// constructed as (form, tag).
5795	0	0	lemmas.emplace_back(form, tag);
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
5796			}
5797
5798			inline void english_morpho_guesser::add(const string& tag, const string& tag2, const string& form, vector& lemmas) const {
			// Two-tag convenience overload: appends (form, tag) and then (form, tag2), in that order.
5799			add(tag, form, lemmas);
5800			add(tag2, form, lemmas);
5801			}
5802
5803	0		inline void english_morpho_guesser::add(const string& tag, const string& form, unsigned negation_len, vector& lemmas) const {
			// Appends one analysis with `tag`, taking a negative prefix into account.
			// When negation_len is 0 the lemma is `form` itself. Otherwise the first negation_len
			// characters are moved behind the rest and separated by '^':
			//   <form without prefix> + "^" + <prefix>
			// negation_len must not exceed form.size(), otherwise substr() throws.
5804	0	0	lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag);
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
5805	0		}
5806
5807	0		inline void english_morpho_guesser::add(const string& tag, const string& tag2, const string& form, unsigned negation_len, vector& lemmas) const {
			// Two-tag convenience overload of the negation-aware add(): appends the analysis for
			// `tag` and then the one for `tag2`, both with the same form and negation_len.
5808	0		add(tag, form, negation_len, lemmas);
5809	0		add(tag2, form, negation_len, lemmas);
5810	0		}
5811
5812			// Common definitions (written backwards)
			// REM(str, len): copy of `str` without its last `len` characters.
			// REM_ADD(str, len, add): the same copy with `add` appended afterwards.
			// Both evaluate `str` twice and expect len <= str.size(); a larger len makes
			// str.size() - len wrap around, so substr() returns the whole string.
5813			#define REM(str, len) (str.substr(0, str.size() - len))
5814			#define REM_ADD(str, len, add) (str.substr(0, str.size() - len).append(add))
5815
5816			static const char _NNS_actions[] = {
			// Lookup tables of the NNS state machine that english_morpho_guesser::add_NNS drives.
			// NOTE(review): the layout looks machine-generated -- confirm the generator before
			// editing any number by hand.
			//
			// _NNS_actions: packed action lists, each a count followed by that many action ids
			// (0..13, dispatched by the switch in add_NNS). Offset 0 holds an empty list.
5817			0, 1, 0, 1, 1, 1, 2, 1,
5818			3, 1, 4, 1, 5, 1, 6, 1,
5819			7, 1, 8, 1, 9, 1, 10, 1,
5820			11, 1, 12, 1, 13
5821			};
5822
			// Per state: offset of the state's first key inside _NNS_trans_keys.
5823			static const char _NNS_key_offsets[] = {
5824			0, 0, 2, 3, 4, 5, 7, 17,
5825			17, 29, 30, 35, 35, 36, 37, 37,
5826			37, 44, 45, 53, 63, 72
5827			};
5828
			// Keys (character codes) of all states, concatenated: per state the single keys first,
			// then the range keys as (low, high) pairs.
5829			static const char _NNS_trans_keys[] = {
5830			110, 115, 101, 109, 101, 99, 115, 98,
5831			100, 102, 104, 106, 110, 112, 116, 118,
5832			122, 104, 122, 98, 100, 102, 103, 106,
5833			110, 112, 116, 118, 120, 111, 97, 101,
5834			105, 111, 117, 105, 119, 104, 105, 111,
5835			115, 118, 120, 122, 115, 97, 101, 105,
5836			110, 111, 114, 115, 117, 98, 100, 102,
5837			104, 106, 110, 112, 116, 118, 122, 97,
5838			101, 105, 111, 117, 121, 122, 98, 120,
5839			0
5840			};
5841
			// Per state: number of single (exact-match) keys.
5842			static const char _NNS_single_lengths[] = {
5843			0, 2, 1, 1, 1, 2, 0, 0,
5844			2, 1, 5, 0, 1, 1, 0, 0,
5845			7, 1, 8, 0, 7, 0
5846			};
5847
			// Per state: number of (low, high) key ranges.
5848			static const char _NNS_range_lengths[] = {
5849			0, 0, 0, 0, 0, 0, 5, 0,
5850			5, 0, 0, 0, 0, 0, 0, 0,
5851			0, 0, 0, 5, 1, 0
5852			};
5853
			// Per state: index of the state's first entry inside _NNS_indicies.
5854			static const char _NNS_index_offsets[] = {
5855			0, 0, 3, 5, 7, 9, 12, 18,
5856			19, 27, 29, 35, 36, 38, 40, 41,
5857			42, 50, 52, 61, 67, 76
5858			};
5859
			// Maps a (state, matched key) slot to a transition id.
5860			static const char _NNS_indicies[] = {
5861			0, 2, 1, 3, 1, 4, 1, 6,
5862			5, 7, 7, 1, 8, 8, 8, 8,
5863			8, 1, 9, 11, 10, 10, 10, 10,
5864			10, 10, 1, 12, 1, 13, 13, 13,
5865			13, 13, 1, 14, 15, 1, 16, 1,
5866			17, 1, 18, 19, 20, 21, 22, 7,
5867			23, 1, 24, 1, 25, 25, 25, 26,
5868			25, 27, 28, 29, 1, 30, 30, 30,
5869			30, 30, 1, 31, 31, 31, 31, 31,
5870			31, 33, 32, 1, 17, 0
5871			};
5872
			// Per transition id: the state entered next. Target 0 makes add_NNS stop the machine.
5873			static const char _NNS_trans_targs[] = {
5874			2, 0, 4, 3, 15, 15, 16, 15,
5875			7, 15, 15, 17, 15, 11, 15, 13,
5876			15, 15, 5, 6, 8, 18, 12, 20,
5877			15, 15, 9, 10, 15, 19, 15, 15,
5878			14, 21
5879			};
5880
			// Per transition id: offset of its action list inside _NNS_actions; 0 means no action.
5881			static const char _NNS_trans_actions[] = {
5882			0, 0, 0, 0, 1, 27, 27, 21,
5883			0, 23, 25, 25, 19, 0, 17, 0,
5884			5, 11, 0, 0, 0, 21, 0, 21,
5885			3, 9, 0, 0, 15, 9, 7, 13,
5886			0, 15
5887			};
5888
			// State in which add_NNS starts the machine.
5889			static const int NNS_start = 1;
5890
5891	0		void english_morpho_guesser::add_NNS(const string& form, unsigned negation_len, vector& lemmas) const {
			// Appends one NNS analysis whose lemma is derived from `form` by a suffix rule.
			//   form         - the word to analyse.
			//   negation_len - length of a negative prefix; these leading characters are not fed
			//                  to the machine and are passed on to add() for the lemma.
			//   lemmas       - output, appended to.
			// The NNS machine picks a rule (remove N trailing characters, optionally append a
			// string). When no rule fires, remove stays 0 and the lemma equals the form.
5892	0		const char* p = form.c_str() + negation_len; int cs;
			// `best` ranks the fired rules: every action only applies when its letter is smaller
			// than the current `best`, so the rule with the smallest letter wins regardless of the
			// order in which actions fire.
5893			char best = 'z'; unsigned remove = 0; const char* append = nullptr;
5894
5895			{
5896			cs = NNS_start;
5897			}
5898
5899			{
			// Table-driven run of the NNS machine. `p` counts consumed characters, while every
			// character is fetched as form[size - 1 - (p - c_str - negation_len)], i.e. the word is
			// read from its last character backwards and the read stops before the prefix.
5900			int _klen;
5901			unsigned int _trans;
5902			const char *_acts;
5903			unsigned int _nacts;
5904			const char *_keys;
5905
5906	0	0	if ( p == ( (form.c_str() + form.size())) )
5907			goto _test_eof;
5908			if ( cs == 0 )
5909			goto _out;
5910			_resume:
5911	0		_keys = _NNS_trans_keys + _NNS_key_offsets[cs];
5912	0		_trans = _NNS_index_offsets[cs];
5913
			// Step 1: binary search among the single keys of the current state.
5914	0		_klen = _NNS_single_lengths[cs];
5915	0	0	if ( _klen > 0 ) {
5916			const char *_lower = _keys;
5917			const char *_mid;
5918	0		const char *_upper = _keys + _klen - 1;
5919			while (1) {
5920	0	0	if ( _upper < _lower )
5921			break;
5922
5923	0		_mid = _lower + ((_upper-_lower) >> 1);
5924	0	0	if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < *_mid )
5925	0		_upper = _mid - 1;
5926	0	0	else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > *_mid )
5927	0		_lower = _mid + 1;
5928			else {
5929	0		_trans += (unsigned int)(_mid - _keys);
5930	0		goto _match;
5931			}
5932			}
5933	0		_keys += _klen;
5934	0		_trans += _klen;
5935			}
5936
			// Step 2: binary search among the (low, high) ranges, stepping two keys at a time.
5937	0		_klen = _NNS_range_lengths[cs];
5938	0	0	if ( _klen > 0 ) {
5939			const char *_lower = _keys;
5940			const char *_mid;
5941	0		const char *_upper = _keys + (_klen<<1) - 2;
5942			while (1) {
5943	0	0	if ( _upper < _lower )
5944			break;
5945
5946	0		_mid = _lower + (((_upper-_lower) >> 1) & ~1);
5947	0	0	if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < _mid[0] )
5948	0		_upper = _mid - 2;
5949	0	0	else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > _mid[1] )
5950	0		_lower = _mid + 2;
5951			else {
5952	0		_trans += (unsigned int)((_mid - _keys)>>1);
5953	0		goto _match;
5954			}
5955			}
			// Nothing matched: _trans now points at the state's fallback slot.
5956	0		_trans += _klen;
5957			}
5958
5959			_match:
			// Resolve the slot to a transition id, move to its target state and run its actions.
5960	0		_trans = _NNS_indicies[_trans];
5961	0		cs = _NNS_trans_targs[_trans];
5962
5963	0	0	if ( _NNS_trans_actions[_trans] == 0 )
5964			goto _again;
5965
5966	0		_acts = _NNS_actions + _NNS_trans_actions[_trans];
5967	0		_nacts = (unsigned int) *_acts++;
5968	0	0	while ( _nacts-- > 0 )
5969			{
5970	0		switch ( *_acts++ )
5971			{
			// Each case is one suffix rule: `remove` trailing characters are dropped and, when
			// `append` is set, that text is added instead.
5972			case 0:
5973	0	0	{ if (best > 'a') best = 'a', remove = 2, append = "an"; }
5974			break;
5975			case 1:
5976	0	0	{ if (best > 'b') best = 'b', remove = 1, append = nullptr; }
5977			break;
5978			case 2:
5979	0	0	{ if (best > 'c') best = 'c', remove = 3, append = "fe"; }
5980			break;
5981			case 3:
5982	0	0	{ if (best > 'd') best = 'd', remove = 2, append = nullptr; }
5983			break;
5984			case 4:
5985	0	0	{ if (best > 'e') best = 'e', remove = 1, append = nullptr; }
5986			break;
5987			case 5:
5988	0	0	{ if (best > 'f') best = 'f', remove = 2, append = nullptr; }
5989			break;
5990			case 6:
5991	0	0	{ if (best > 'g') best = 'g', remove = 1, append = nullptr; }
5992			break;
5993			case 7:
5994	0	0	{ if (best > 'h') best = 'h', remove = 2, append = nullptr; }
5995			break;
5996			case 8:
5997	0	0	{ if (best > 'i') best = 'i', remove = 1, append = nullptr; }
5998			break;
5999			case 9:
6000	0	0	{ if (best > 'j') best = 'j', remove = 1, append = nullptr; }
6001			break;
6002			case 10:
6003	0	0	{ if (best > 'k') best = 'k', remove = 2, append = nullptr; }
6004			break;
6005			case 11:
6006	0	0	{ if (best > 'l') best = 'l', remove = 3, append = "y"; }
6007			break;
6008			case 12:
6009	0	0	{ if (best > 'm') best = 'm', remove = 2, append = nullptr; }
6010			break;
6011			case 13:
6012	0	0	{ if (best > 'n') best = 'n', remove = 1, append = nullptr; }
6013			break;
6014			}
6015			}
6016
6017			_again:
			// State 0 ends the run early; otherwise continue until all characters are consumed.
6018	0	0	if ( cs == 0 )
6019			goto _out;
6020	0	0	if ( ++p != ( (form.c_str() + form.size())) )
6021			goto _resume;
6022			_test_eof: {}
6023			_out: {}
6024			}
6025
			// Apply the winning rule and hand the result to the negation-aware add().
6026	0	0	add(NNS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas);
		0
		0
6027	0		}
6028
6029			static const char _NNPS_actions[] = {
			// Lookup tables of the NNPS state machine that english_morpho_guesser::add_NNPS drives.
			// NOTE(review): the layout looks machine-generated -- confirm the generator before
			// editing any number by hand.
			//
			// _NNPS_actions: packed action lists, each a count followed by that many action ids
			// (0..16, dispatched by the switch in add_NNPS). Offset 0 holds an empty list.
6030			0, 1, 1, 1, 2, 1, 4, 1,
6031			5, 1, 6, 1, 7, 1, 8, 1,
6032			9, 1, 10, 1, 11, 1, 12, 1,
6033			14, 1, 15, 1, 16, 2, 0, 1,
6034			2, 3, 4, 2, 13, 14
6035			};
6036
			// Per state: offset of the state's first key inside _NNPS_trans_keys.
6037			static const unsigned char _NNPS_key_offsets[] = {
6038			0, 0, 4, 6, 8, 10, 12, 16,
6039			36, 36, 60, 62, 72, 72, 74, 76,
6040			78, 78, 98, 98, 100, 102, 104, 104,
6041			118, 120, 136, 156, 174, 174
6042			};
6043
			// Keys (character codes) of all states, concatenated: per state the single keys first,
			// then the range keys as (low, high) pairs. Upper- and lower-case letter codes both
			// occur in this table.
6044			static const char _NNPS_trans_keys[] = {
6045			78, 83, 110, 115, 69, 101, 77, 109,
6046			77, 109, 69, 101, 67, 83, 99, 115,
6047			66, 68, 70, 72, 74, 78, 80, 84,
6048			86, 90, 98, 100, 102, 104, 106, 110,
6049			112, 116, 118, 122, 72, 90, 104, 122,
6050			66, 68, 70, 71, 74, 78, 80, 84,
6051			86, 88, 98, 100, 102, 103, 106, 110,
6052			112, 116, 118, 120, 79, 111, 65, 69,
6053			73, 79, 85, 97, 101, 105, 111, 117,
6054			73, 105, 87, 119, 87, 119, 66, 68,
6055			70, 72, 74, 78, 80, 84, 86, 90,
6056			98, 100, 102, 104, 106, 110, 112, 116,
6057			118, 122, 73, 105, 69, 101, 69, 101,
6058			72, 73, 79, 83, 86, 88, 90, 104,
6059			105, 111, 115, 118, 120, 122, 83, 115,
6060			65, 69, 73, 78, 79, 82, 83, 85,
6061			97, 101, 105, 110, 111, 114, 115, 117,
6062			66, 68, 70, 72, 74, 78, 80, 84,
6063			86, 90, 98, 100, 102, 104, 106, 110,
6064			112, 116, 118, 122, 65, 69, 73, 79,
6065			85, 89, 90, 97, 101, 105, 111, 117,
6066			121, 122, 66, 88, 98, 120, 72, 73,
6067			79, 83, 86, 88, 90, 104, 105, 111,
6068			115, 118, 120, 122, 0
6069			};
6070
			// Per state: number of single (exact-match) keys.
6071			static const char _NNPS_single_lengths[] = {
6072			0, 4, 2, 2, 2, 2, 4, 0,
6073			0, 4, 2, 10, 0, 2, 2, 2,
6074			0, 0, 0, 2, 2, 2, 0, 14,
6075			2, 16, 0, 14, 0, 14
6076			};
6077
			// Per state: number of (low, high) key ranges.
6078			static const char _NNPS_range_lengths[] = {
6079			0, 0, 0, 0, 0, 0, 0, 10,
6080			0, 10, 0, 0, 0, 0, 0, 0,
6081			0, 10, 0, 0, 0, 0, 0, 0,
6082			0, 0, 10, 2, 0, 0
6083			};
6084
			// Per state: index of the state's first entry inside _NNPS_indicies.
6085			static const unsigned char _NNPS_index_offsets[] = {
6086			0, 0, 5, 8, 11, 14, 17, 22,
6087			33, 34, 49, 52, 63, 64, 67, 70,
6088			73, 74, 85, 86, 89, 92, 95, 96,
6089			111, 114, 131, 142, 159, 160
6090			};
6091
			// Maps a (state, matched key) slot to a transition id.
6092			static const char _NNPS_indicies[] = {
6093			0, 2, 3, 4, 1, 5, 6, 1,
6094			7, 8, 1, 8, 8, 1, 10, 11,
6095			9, 12, 12, 12, 12, 1, 13, 13,
6096			13, 13, 13, 13, 13, 13, 13, 13,
6097			1, 14, 16, 15, 16, 15, 15, 15,
6098			15, 15, 15, 15, 15, 15, 15, 15,
6099			1, 17, 17, 1, 18, 18, 18, 18,
6100			18, 18, 18, 18, 18, 18, 1, 19,
6101			20, 21, 1, 22, 23, 1, 23, 23,
6102			1, 24, 25, 25, 25, 25, 25, 25,
6103			25, 25, 25, 25, 1, 26, 21, 21,
6104			1, 6, 6, 1, 11, 11, 9, 1,
6105			27, 28, 29, 30, 31, 12, 32, 27,
6106			33, 29, 30, 34, 12, 32, 1, 35,
6107			35, 1, 36, 36, 36, 37, 36, 38,
6108			39, 40, 36, 36, 36, 37, 36, 38,
6109			39, 40, 1, 41, 41, 41, 41, 41,
6110			41, 41, 41, 41, 41, 1, 42, 42,
6111			42, 42, 42, 42, 44, 42, 42, 42,
6112			42, 42, 42, 44, 43, 43, 1, 24,
6113			27, 33, 29, 30, 34, 12, 32, 27,
6114			33, 29, 30, 34, 12, 32, 1, 0
6115			};
6116
			// Per transition id: the state entered next. Target 0 makes add_NNPS stop the machine.
6117			static const char _NNPS_trans_targs[] = {
6118			2, 0, 5, 20, 21, 3, 4, 22,
6119			22, 22, 23, 29, 22, 8, 22, 22,
6120			24, 22, 12, 22, 14, 15, 22, 22,
6121			22, 18, 22, 6, 7, 9, 25, 13,
6122			27, 17, 19, 22, 22, 10, 11, 22,
6123			26, 22, 22, 16, 28
6124			};
6125
			// Per transition id: offset of its action list inside _NNPS_actions; 0 means no action.
6126			static const char _NNPS_trans_actions[] = {
6127			0, 0, 0, 0, 0, 0, 0, 29,
6128			1, 27, 27, 27, 21, 0, 35, 25,
6129			25, 19, 0, 17, 0, 0, 32, 5,
6130			11, 0, 23, 0, 0, 0, 21, 0,
6131			21, 0, 0, 3, 9, 0, 0, 15,
6132			9, 7, 13, 0, 15
6133			};
6134
			// State in which add_NNPS starts the machine.
6135			static const int NNPS_start = 1;
6136
6137	0		void english_morpho_guesser::add_NNPS(const string& form, vector& lemmas) const {
			// Appends one NNPS analysis whose lemma is derived from `form` by a suffix rule.
			//   form   - the word to analyse (no negation prefix handling in this variant).
			//   lemmas - output, appended to.
			// The NNPS machine picks a rule (remove N trailing characters, optionally append a
			// string). When no rule fires, remove stays 0 and the lemma equals the form.
6138			const char* p = form.c_str(); int cs;
			// `best` ranks the fired rules: every action only applies when its letter is smaller
			// than the current `best`, so the rule with the smallest letter wins regardless of the
			// order in which actions fire.
6139			char best = 'z'; unsigned remove = 0; const char* append = nullptr;
6140
6141			{
6142			cs = NNPS_start;
6143			}
6144
6145			{
			// Table-driven run of the NNPS machine. `p` counts consumed characters, while every
			// character is fetched as form[size - 1 - (p - c_str)], i.e. the word is read from its
			// last character towards its first.
6146			int _klen;
6147			unsigned int _trans;
6148			const char *_acts;
6149			unsigned int _nacts;
6150			const char *_keys;
6151
6152	0	0	if ( p == ( (form.c_str() + form.size())) )
6153			goto _test_eof;
6154			if ( cs == 0 )
6155			goto _out;
6156			_resume:
6157	0		_keys = _NNPS_trans_keys + _NNPS_key_offsets[cs];
6158	0		_trans = _NNPS_index_offsets[cs];
6159
			// Step 1: binary search among the single keys of the current state.
6160	0		_klen = _NNPS_single_lengths[cs];
6161	0	0	if ( _klen > 0 ) {
6162			const char *_lower = _keys;
6163			const char *_mid;
6164	0		const char *_upper = _keys + _klen - 1;
6165			while (1) {
6166	0	0	if ( _upper < _lower )
6167			break;
6168
6169	0		_mid = _lower + ((_upper-_lower) >> 1);
6170	0	0	if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid )
6171	0		_upper = _mid - 1;
6172	0	0	else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid )
6173	0		_lower = _mid + 1;
6174			else {
6175	0		_trans += (unsigned int)(_mid - _keys);
6176	0		goto _match;
6177			}
6178			}
6179	0		_keys += _klen;
6180	0		_trans += _klen;
6181			}
6182
			// Step 2: binary search among the (low, high) ranges, stepping two keys at a time.
6183	0		_klen = _NNPS_range_lengths[cs];
6184	0	0	if ( _klen > 0 ) {
6185			const char *_lower = _keys;
6186			const char *_mid;
6187	0		const char *_upper = _keys + (_klen<<1) - 2;
6188			while (1) {
6189	0	0	if ( _upper < _lower )
6190			break;
6191
6192	0		_mid = _lower + (((_upper-_lower) >> 1) & ~1);
6193	0	0	if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] )
6194	0		_upper = _mid - 2;
6195	0	0	else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] )
6196	0		_lower = _mid + 2;
6197			else {
6198	0		_trans += (unsigned int)((_mid - _keys)>>1);
6199	0		goto _match;
6200			}
6201			}
			// Nothing matched: _trans now points at the state's fallback slot.
6202	0		_trans += _klen;
6203			}
6204
6205			_match:
			// Resolve the slot to a transition id, move to its target state and run its actions.
6206	0		_trans = _NNPS_indicies[_trans];
6207	0		cs = _NNPS_trans_targs[_trans];
6208
6209	0	0	if ( _NNPS_trans_actions[_trans] == 0 )
6210			goto _again;
6211
6212	0		_acts = _NNPS_actions + _NNPS_trans_actions[_trans];
6213	0		_nacts = (unsigned int) *_acts++;
6214	0	0	while ( _nacts-- > 0 )
6215			{
6216	0		switch ( *_acts++ )
6217			{
			// Each case is one suffix rule: `remove` trailing characters are dropped and, when
			// `append` is set, that text is added instead. Rules that append text come in an
			// upper-case and a lower-case variant ("AN"/"an", "FE"/"fe", "Y"/"y").
6218			case 0:
6219	0	0	{ if (best > 'a') best = 'a', remove = 2, append = "AN"; }
6220			break;
6221			case 1:
6222	0	0	{ if (best > 'b') best = 'b', remove = 2, append = "an"; }
6223			break;
6224			case 2:
6225	0	0	{ if (best > 'c') best = 'c', remove = 1, append = nullptr; }
6226			break;
6227			case 3:
6228	0	0	{ if (best > 'd') best = 'd', remove = 3, append = "FE"; }
6229			break;
6230			case 4:
6231	0	0	{ if (best > 'e') best = 'e', remove = 3, append = "fe"; }
6232			break;
6233			case 5:
6234	0	0	{ if (best > 'f') best = 'f', remove = 2, append = nullptr; }
6235			break;
6236			case 6:
6237	0	0	{ if (best > 'g') best = 'g', remove = 1, append = nullptr; }
6238			break;
6239			case 7:
6240	0	0	{ if (best > 'h') best = 'h', remove = 2, append = nullptr; }
6241			break;
6242			case 8:
6243	0	0	{ if (best > 'i') best = 'i', remove = 1, append = nullptr; }
6244			break;
6245			case 9:
6246	0	0	{ if (best > 'j') best = 'j', remove = 2, append = nullptr; }
6247			break;
6248			case 10:
6249	0	0	{ if (best > 'k') best = 'k', remove = 1, append = nullptr; }
6250			break;
6251			case 11:
6252	0	0	{ if (best > 'l') best = 'l', remove = 1, append = nullptr; }
6253			break;
6254			case 12:
6255	0	0	{ if (best > 'm') best = 'm', remove = 2, append = nullptr; }
6256			break;
6257			case 13:
6258	0	0	{ if (best > 'n') best = 'n', remove = 3, append = "Y"; }
6259			break;
6260			case 14:
6261	0	0	{ if (best > 'o') best = 'o', remove = 3, append = "y"; }
6262			break;
6263			case 15:
6264	0	0	{ if (best > 'p') best = 'p', remove = 2, append = nullptr; }
6265			break;
6266			case 16:
6267	0	0	{ if (best > 'q') best = 'q', remove = 1, append = nullptr; }
6268			break;
6269			}
6270			}
6271
6272			_again:
			// State 0 ends the run early; otherwise continue until all characters are consumed.
6273	0	0	if ( cs == 0 )
6274			goto _out;
6275	0	0	if ( ++p != ( (form.c_str() + form.size())) )
6276			goto _resume;
6277			_test_eof: {}
6278			_out: {}
6279			}
6280
			// Apply the winning rule and add the resulting lemma with the NNPS tag.
6281	0	0	add(NNPS, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas);
		0
6282	0		}
6283
6284			static const char _VBG_actions[] = {
			// Lookup tables named after the VBG tag. NOTE(review): the function that consumes them
			// is not part of this section; the per-table notes below assume they are read the same
			// way as the _NNS_* and _NNPS_* tables of this file -- confirm against that function.
			//
			// _VBG_actions: presumably packed action lists, each a count followed by that many
			// action ids.
6285			0, 1, 1, 1, 2, 1, 4, 1,
6286			5, 1, 6, 1, 7, 1, 9, 1,
6287			10, 1, 11, 1, 12, 1, 13, 1,
6288			14, 1, 15, 1, 16, 1, 17, 2,
6289			0, 12, 2, 3, 4, 2, 5, 9,
6290			2, 5, 10, 2, 8, 9, 2, 9,
6291			10, 2, 11, 12, 3, 0, 2, 12,
6292			3, 2, 11, 12
6293			};
6294
			// Presumably per state: offset of the state's first key inside _VBG_trans_keys.
			// Stored as short because the offsets exceed the range of a char.
6295			static const short _VBG_key_offsets[] = {
6296			0, 0, 1, 2, 3, 9, 14, 24,
6297			29, 34, 44, 46, 47, 48, 49, 50,
6298			51, 52, 59, 66, 68, 70, 71, 72,
6299			73, 74, 75, 76, 81, 89, 90, 91,
6300			92, 93, 94, 96, 97, 98, 99, 100,
6301			101, 102, 127, 127, 136, 137, 142, 153,
6302			162, 171, 181, 186, 191, 197, 207, 207,
6303			216, 228, 229, 240, 240, 249, 258, 267,
6304			276, 285, 290, 302, 313, 318, 324, 334,
6305			344, 355, 362, 373, 382, 391, 391, 402,
6306			413, 415, 416, 417, 417, 418, 426, 437,
6307			442, 448, 458, 468, 479, 486, 497, 504,
6308			510, 519, 528, 537, 543
6309			};
6310
			// Presumably the keys (character codes) of all states, concatenated.
6311			static const char _VBG_trans_keys[] = {
6312			103, 110, 105, 97, 101, 105, 111, 117,
6313			121, 97, 101, 105, 111, 117, 98, 100,
6314			102, 104, 106, 110, 112, 116, 118, 122,
6315			97, 101, 105, 111, 117, 97, 101, 105,
6316			111, 117, 98, 100, 102, 104, 106, 110,
6317			112, 116, 118, 122, 98, 114, 105, 114,
6318			112, 105, 109, 101, 97, 101, 105, 111,
6319			117, 98, 122, 97, 101, 105, 111, 117,
6320			98, 122, 97, 122, 98, 114, 105, 114,
6321			112, 105, 109, 101, 97, 101, 105, 111,
6322			117, 97, 101, 105, 110, 111, 115, 117,
6323			120, 105, 112, 105, 109, 101, 98, 114,
6324			105, 114, 112, 105, 109, 101, 98, 99,
6325			100, 102, 103, 104, 106, 107, 108, 109,
6326			110, 111, 112, 113, 114, 115, 116, 117,
6327			118, 119, 120, 121, 122, 97, 105, 97,
6328			98, 101, 105, 111, 117, 122, 99, 120,
6329			113, 97, 101, 105, 111, 117, 98, 99,
6330			100, 105, 111, 117, 122, 97, 101, 102,
6331			120, 97, 100, 101, 105, 111, 117, 122,
6332			98, 120, 97, 101, 102, 105, 111, 117,
6333			122, 98, 120, 97, 101, 103, 105, 110,
6334			111, 117, 122, 98, 120, 97, 101, 105,
6335			111, 117, 101, 110, 111, 115, 120, 101,
6336			110, 111, 112, 115, 120, 97, 101, 104,
6337			105, 111, 116, 117, 122, 98, 120, 97,
6338			101, 105, 106, 111, 117, 122, 98, 120,
6339			98, 99, 100, 105, 107, 111, 117, 122,
6340			97, 101, 102, 120, 105, 97, 101, 105,
6341			108, 111, 114, 117, 119, 122, 98, 120,
6342			97, 101, 105, 109, 111, 117, 122, 98,
6343			120, 97, 101, 105, 110, 111, 117, 122,
6344			98, 120, 97, 101, 105, 111, 112, 117,
6345			122, 98, 120, 97, 101, 105, 111, 113,
6346			117, 122, 98, 120, 97, 101, 105, 111,
6347			114, 117, 122, 98, 120, 97, 101, 105,
6348			111, 117, 98, 99, 100, 105, 108, 111,
6349			116, 117, 97, 101, 102, 122, 101, 110,
6350			111, 115, 120, 98, 104, 106, 116, 118,
6351			122, 101, 110, 111, 115, 120, 101, 110,
6352			111, 112, 115, 120, 101, 105, 110, 111,
6353			115, 120, 98, 116, 118, 122, 101, 105,
6354			110, 111, 115, 120, 98, 116, 118, 122,
6355			101, 110, 111, 115, 120, 98, 104, 106,
6356			116, 118, 122, 98, 101, 110, 111, 114,
6357			115, 120, 101, 110, 111, 115, 120, 98,
6358			104, 106, 116, 118, 122, 97, 101, 105,
6359			111, 115, 117, 122, 98, 120, 97, 101,
6360			105, 111, 116, 117, 122, 98, 120, 122,
6361			98, 100, 102, 104, 106, 110, 112, 116,
6362			118, 120, 122, 98, 100, 102, 104, 106,
6363			110, 112, 116, 118, 120, 98, 114, 112,
6364			114, 113, 97, 101, 105, 108, 111, 117,
6365			98, 122, 101, 110, 111, 115, 120, 98,
6366			104, 106, 116, 118, 122, 101, 110, 111,
6367			115, 120, 101, 110, 111, 112, 115, 120,
6368			101, 105, 110, 111, 115, 120, 98, 116,
6369			118, 122, 101, 105, 110, 111, 115, 120,
6370			98, 116, 118, 122, 101, 110, 111, 115,
6371			120, 98, 104, 106, 116, 118, 122, 98,
6372			101, 110, 111, 114, 115, 120, 101, 110,
6373			111, 115, 120, 98, 104, 106, 116, 118,
6374			122, 97, 101, 105, 111, 117, 98, 122,
6375			97, 101, 105, 111, 117, 121, 97, 101,
6376			105, 111, 117, 118, 122, 98, 120, 97,
6377			101, 105, 111, 117, 119, 122, 98, 120,
6378			97, 101, 105, 111, 117, 120, 122, 98,
6379			119, 97, 101, 105, 111, 117, 121, 97,
6380			101, 105, 111, 117, 121, 122, 98, 120,
6381			0
6382			};
6383
			// Presumably per state: number of single (exact-match) keys.
6384			static const char _VBG_single_lengths[] = {
6385			0, 1, 1, 1, 6, 5, 0, 5,
6386			5, 0, 2, 1, 1, 1, 1, 1,
6387			1, 5, 5, 0, 2, 1, 1, 1,
6388			1, 1, 1, 5, 8, 1, 1, 1,
6389			1, 1, 2, 1, 1, 1, 1, 1,
6390			1, 23, 0, 7, 1, 5, 7, 7,
6391			7, 8, 5, 5, 6, 8, 0, 7,
6392			8, 1, 9, 0, 7, 7, 7, 7,
6393			7, 5, 8, 5, 5, 6, 6, 6,
6394			5, 7, 5, 7, 7, 0, 1, 1,
6395			2, 1, 1, 0, 1, 6, 5, 5,
6396			6, 6, 6, 5, 7, 5, 5, 6,
6397			7, 7, 7, 6, 7
6398			};
6399
			// Presumably per state: number of (low, high) key ranges.
6400			static const char _VBG_range_lengths[] = {
6401			0, 0, 0, 0, 0, 0, 5, 0,
6402			0, 5, 0, 0, 0, 0, 0, 0,
6403			0, 1, 1, 1, 0, 0, 0, 0,
6404			0, 0, 0, 0, 0, 0, 0, 0,
6405			0, 0, 0, 0, 0, 0, 0, 0,
6406			0, 1, 0, 1, 0, 0, 2, 1,
6407			1, 1, 0, 0, 0, 1, 0, 1,
6408			2, 0, 1, 0, 1, 1, 1, 1,
6409			1, 0, 2, 3, 0, 0, 2, 2,
6410			3, 0, 3, 1, 1, 0, 5, 5,
6411			0, 0, 0, 0, 0, 1, 3, 0,
6412			0, 2, 2, 3, 0, 3, 1, 0,
6413			1, 1, 1, 0, 1
6414			};
6415
			// Presumably per state: index of the state's first entry inside the VBG indices table.
6416			static const short _VBG_index_offsets[] = {
6417			0, 0, 2, 4, 6, 13, 19, 25,
6418			31, 37, 43, 46, 48, 50, 52, 54,
6419			56, 58, 65, 72, 74, 77, 79, 81,
6420			83, 85, 87, 89, 95, 104, 106, 108,
6421			110, 112, 114, 117, 119, 121, 123, 125,
6422			127, 129, 154, 155, 164, 166, 172, 182,
6423			191, 200, 210, 216, 222, 229, 239, 240,
6424			249, 260, 262, 273, 274, 283, 292, 301,
6425			310, 319, 325, 336, 345, 351, 358, 367,
6426			376, 385, 393, 402, 411, 420, 421, 428,
6427			435, 438, 440, 442, 443, 445, 453, 462,
6428			468, 475, 484, 493, 502, 510, 519, 526,
6429			533, 542, 551, 560, 567
6430			};
6431
6432			static const unsigned char _VBG_indicies[] = { // Slot -> transition id; add_VBG uses the id to index _VBG_trans_targs and _VBG_trans_actions.
6433			0, 1, 2, 1, 3, 1, 4, 4,
6434			4, 4, 4, 4, 1, 5, 5, 5,
6435			5, 6, 1, 7, 7, 7, 7, 7,
6436			1, 8, 8, 8, 8, 9, 1, 5,
6437			5, 5, 5, 10, 1, 11, 11, 11,
6438			11, 11, 1, 11, 12, 1, 11, 1,
6439			13, 1, 11, 1, 14, 1, 11, 1,
6440			11, 1, 5, 5, 5, 5, 6, 15,
6441			1, 5, 5, 5, 5, 6, 16, 1,
6442			4, 1, 17, 18, 1, 17, 1, 19,
6443			1, 17, 1, 20, 1, 17, 1, 17,
6444			1, 21, 22, 21, 23, 24, 1, 25,
6445			26, 25, 27, 28, 29, 25, 30, 1,
6446			31, 1, 31, 1, 32, 1, 31, 1,
6447			31, 1, 33, 34, 1, 33, 1, 35,
6448			1, 33, 1, 36, 1, 33, 1, 33,
6449			1, 38, 39, 40, 41, 42, 43, 44,
6450			45, 46, 47, 48, 49, 50, 51, 52,
6451			53, 54, 55, 56, 57, 58, 59, 60,
6452			37, 1, 1, 61, 62, 61, 61, 61,
6453			61, 63, 63, 1, 64, 1, 65, 65,
6454			65, 65, 65, 1, 67, 68, 67, 66,
6455			66, 66, 67, 66, 67, 1, 69, 62,
6456			69, 69, 69, 69, 63, 63, 1, 61,
6457			61, 62, 61, 61, 61, 63, 63, 1,
6458			66, 66, 68, 66, 70, 66, 66, 67,
6459			67, 1, 71, 71, 71, 71, 71, 1,
6460			72, 73, 74, 75, 76, 1, 72, 73,
6461			74, 11, 75, 76, 1, 61, 61, 62,
6462			61, 61, 77, 61, 63, 63, 1, 78,
6463			61, 61, 61, 62, 61, 61, 63, 63,
6464			1, 63, 79, 63, 61, 62, 61, 61,
6465			63, 61, 63, 1, 7, 1, 61, 61,
6466			61, 68, 61, 80, 61, 80, 67, 67,
6467			1, 5, 61, 61, 61, 62, 61, 61,
6468			63, 63, 1, 81, 81, 82, 62, 81,
6469			81, 63, 63, 1, 81, 81, 81, 81,
6470			62, 81, 63, 63, 1, 61, 61, 61,
6471			61, 62, 61, 63, 63, 1, 61, 83,
6472			61, 84, 62, 61, 63, 63, 1, 5,
6473			5, 5, 5, 6, 1, 85, 86, 85,
6474			5, 86, 5, 86, 6, 5, 85, 1,
6475			87, 88, 89, 90, 91, 85, 85, 85,
6476			1, 87, 92, 89, 93, 94, 1, 87,
6477			92, 89, 17, 93, 94, 1, 87, 17,
6478			88, 89, 90, 91, 85, 85, 1, 87,
6479			20, 88, 89, 90, 91, 85, 85, 1,
6480			95, 88, 89, 90, 91, 85, 85, 85,
6481			1, 17, 87, 92, 89, 18, 93, 94,
6482			1, 87, 97, 89, 98, 99, 96, 96,
6483			96, 1, 66, 66, 66, 66, 100, 66,
6484			67, 67, 1, 101, 102, 103, 61, 62,
6485			61, 63, 63, 1, 104, 106, 106, 106,
6486			106, 106, 106, 105, 107, 107, 107, 107,
6487			107, 107, 1, 31, 108, 1, 31, 1,
6488			109, 1, 105, 110, 104, 5, 5, 5,
6489			112, 5, 6, 111, 1, 113, 114, 115,
6490			116, 117, 111, 111, 111, 1, 113, 118,
6491			115, 119, 120, 1, 113, 118, 115, 33,
6492			119, 120, 1, 113, 33, 114, 115, 116,
6493			117, 111, 111, 1, 113, 36, 114, 115,
6494			116, 117, 111, 111, 1, 121, 114, 115,
6495			116, 117, 111, 111, 111, 1, 33, 113,
6496			118, 115, 34, 119, 120, 1, 113, 123,
6497			115, 124, 125, 122, 122, 122, 1, 5,
6498			5, 5, 5, 6, 111, 1, 4, 4,
6499			4, 4, 4, 4, 1, 66, 66, 66,
6500			66, 66, 68, 67, 67, 1, 81, 81,
6501			81, 81, 81, 62, 63, 63, 1, 81,
6502			81, 81, 81, 81, 62, 63, 63, 1,
6503			126, 126, 126, 126, 126, 4, 1, 127,
6504			127, 127, 127, 127, 129, 130, 128, 1,
6505			0
6506			};
6507
6508			static const char _VBG_trans_targs[] = { // Transition id -> next state (cs); a target of 0 makes add_VBG stop scanning.
6509			2, 0, 3, 41, 42, 42, 44, 42,
6510			42, 44, 44, 51, 52, 13, 15, 42,
6511			42, 68, 69, 23, 25, 77, 78, 83,
6512			84, 42, 80, 29, 82, 31, 33, 42,
6513			32, 87, 88, 37, 39, 4, 43, 46,
6514			47, 48, 49, 53, 55, 56, 58, 60,
6515			61, 19, 62, 63, 64, 75, 76, 95,
6516			96, 97, 98, 99, 100, 5, 45, 42,
6517			42, 6, 7, 42, 45, 8, 50, 9,
6518			10, 11, 12, 14, 16, 54, 42, 57,
6519			59, 17, 18, 65, 66, 67, 74, 20,
6520			70, 22, 71, 72, 21, 24, 26, 73,
6521			67, 70, 71, 72, 45, 27, 85, 94,
6522			42, 42, 79, 28, 81, 30, 42, 86,
6523			93, 34, 89, 36, 90, 91, 35, 38,
6524			40, 92, 86, 89, 90, 91, 65, 65,
6525			42, 42, 45
6526			};
6527
6528			static const char _VBG_trans_actions[] = { // Transition id -> offset into _VBG_actions; add_VBG skips the action step when the entry is 0.
6529			0, 0, 0, 29, 23, 15, 15, 3,
6530			46, 46, 40, 0, 0, 0, 0, 5,
6531			34, 0, 0, 0, 0, 15, 15, 15,
6532			15, 11, 11, 0, 11, 0, 0, 9,
6533			0, 0, 0, 0, 0, 0, 0, 0,
6534			0, 0, 0, 0, 0, 0, 0, 0,
6535			0, 0, 0, 0, 0, 0, 0, 21,
6536			0, 0, 0, 23, 0, 0, 19, 19,
6537			7, 0, 0, 49, 49, 0, 49, 0,
6538			0, 0, 0, 0, 0, 19, 17, 19,
6539			49, 0, 0, 27, 27, 0, 0, 0,
6540			0, 0, 0, 0, 0, 0, 0, 0,
6541			25, 25, 25, 25, 56, 0, 9, 9,
6542			13, 43, 43, 0, 9, 0, 37, 0,
6543			0, 0, 0, 0, 0, 0, 0, 0,
6544			0, 0, 7, 7, 7, 7, 23, 1,
6545			31, 1, 52
6546			};
6547
6548			static const char _VBG_eof_actions[] = { // Per state: offset into _VBG_actions of the actions add_VBG runs once the whole form has been consumed.
6549			0, 0, 0, 0, 0, 0, 0, 0,
6550			0, 0, 0, 0, 0, 0, 0, 0,
6551			0, 0, 0, 0, 0, 0, 0, 0,
6552			0, 0, 0, 0, 0, 0, 0, 0,
6553			0, 0, 0, 0, 0, 0, 0, 0,
6554			0, 0, 0, 3, 0, 0, 3, 3,
6555			3, 3, 0, 3, 3, 3, 0, 3,
6556			3, 0, 3, 0, 3, 3, 3, 3,
6557			3, 0, 0, 25, 25, 25, 25, 25,
6558			25, 25, 25, 3, 3, 0, 0, 0,
6559			0, 0, 0, 0, 0, 0, 7, 7,
6560			7, 7, 7, 7, 7, 7, 0, 0,
6561			3, 3, 3, 0, 3
6562			};
6563
6564			static const int VBG_start = 1; // State in which add_VBG starts (assigned to cs before scanning).
6565
6566	0		void english_morpho_guesser::add_VBG(const string& form, vector& lemmas) const {
			// Builds one lemma candidate for `form` and hands it to add(VBG, ..., lemmas).
			//
			// A table-driven state machine (the layout looks Ragel-generated -- TODO confirm)
			// consumes the characters of `form` starting with the LAST one and moving
			// towards the first. Actions fired on the way select a rewrite rule: every
			// rule carries a letter ('a'..'r') and is accepted only when that letter is
			// lower than the current `best`, so the lowest letter seen wins. All rules in
			// this function set remove = 3; `append` becomes either nullptr or "e".
			// When no rule fires, remove stays 0 and the unchanged form is emitted.
6567			const char* p = form.c_str(); int cs;
6568			char best = 'z'; unsigned remove = 0; const char* append = nullptr;
6569			
6570			{
6571			cs = VBG_start;
6572			}
6573			
6574			{
6575			int _klen;
6576			unsigned int _trans;
6577			const char *_acts;
6578			unsigned int _nacts;
6579			const char *_keys;
6580			
			// Empty form: nothing to scan, go straight to the end-of-input handling.
6581	0	0	if ( p == ( (form.c_str() + form.size())) )
6582			goto _test_eof;
6583			if ( cs == 0 )
6584			goto _out;
6585			_resume:
			// Locate the keys and the first transition slot of the current state.
6586	0		_keys = _VBG_trans_keys + _VBG_key_offsets[cs];
6587	0		_trans = _VBG_index_offsets[cs];
6588			
			// Binary search among the single keys of the state. The subscript
			// form.size() - 1 - (p - form.c_str()) mirrors p's offset, which is what
			// makes the machine read the form back to front.
6589	0		_klen = _VBG_single_lengths[cs];
6590	0	0	if ( _klen > 0 ) {
6591			const char *_lower = _keys;
6592			const char *_mid;
6593	0		const char *_upper = _keys + _klen - 1;
6594			while (1) {
6595	0	0	if ( _upper < _lower )
6596			break;
6597			
6598	0		_mid = _lower + ((_upper-_lower) >> 1);
6599	0	0	if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid )
6600	0		_upper = _mid - 1;
6601	0	0	else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid )
6602	0		_lower = _mid + 1;
6603			else {
6604	0		_trans += (unsigned int)(_mid - _keys);
6605	0		goto _match;
6606			}
6607			}
			// No single key matched: skip past them in both the key and the slot index.
6608	0		_keys += _klen;
6609	0		_trans += _klen;
6610			}
6611			
			// Binary search among the low/high range pairs (two key bytes per range).
6612	0		_klen = _VBG_range_lengths[cs];
6613	0	0	if ( _klen > 0 ) {
6614			const char *_lower = _keys;
6615			const char *_mid;
6616	0		const char *_upper = _keys + (_klen<<1) - 2;
6617			while (1) {
6618	0	0	if ( _upper < _lower )
6619			break;
6620			
6621	0		_mid = _lower + (((_upper-_lower) >> 1) & ~1);
6622	0	0	if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] )
6623	0		_upper = _mid - 2;
6624	0	0	else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] )
6625	0		_lower = _mid + 2;
6626			else {
6627	0		_trans += (unsigned int)((_mid - _keys)>>1);
6628	0		goto _match;
6629			}
6630			}
			// No range matched either: _trans ends on the slot after all keys of the state.
6631	0		_trans += _klen;
6632			}
6633			
6634			_match:
			// Turn the slot into a transition id and move to its target state.
6635	0		_trans = _VBG_indicies[_trans];
6636	0		cs = _VBG_trans_targs[_trans];
6637			
6638	0	0	if ( _VBG_trans_actions[_trans] == 0 )
6639			goto _again;
6640			
			// Action list layout: a count followed by that many action ids.
6641	0		_acts = _VBG_actions + _VBG_trans_actions[_trans];
6642	0		_nacts = (unsigned int) *_acts++;
6643	0	0	while ( _nacts-- > 0 )
6644			{
6645	0		switch ( *_acts++ )
6646			{
6647			case 0:
6648	0	0	{ if (best > 'a') best = 'a', remove = 3, append = nullptr; }
6649			break;
6650			case 1:
6651	0	0	{ if (best > 'b') best = 'b', remove = 3, append = "e"; }
6652			break;
6653			case 2:
6654	0	0	{ if (best > 'c') best = 'c', remove = 3, append = nullptr; }
6655			break;
6656			case 3:
6657	0	0	{ if (best > 'd') best = 'd', remove = 3, append = "e"; }
6658			break;
6659			case 4:
6660	0	0	{ if (best > 'e') best = 'e', remove = 3, append = nullptr; }
6661			break;
6662			case 5:
6663	0	0	{ if (best > 'f') best = 'f', remove = 3, append = "e"; }
6664			break;
6665			case 6:
6666	0	0	{ if (best > 'g') best = 'g', remove = 3, append = nullptr; }
6667			break;
6668			case 7:
6669	0	0	{ if (best > 'h') best = 'h', remove = 3, append = "e"; }
6670			break;
6671			case 8:
6672	0	0	{ if (best > 'i') best = 'i', remove = 3, append = nullptr; }
6673			break;
6674			case 9:
6675	0	0	{ if (best > 'j') best = 'j', remove = 3, append = "e"; }
6676			break;
6677			case 10:
6678	0	0	{ if (best > 'k') best = 'k', remove = 3, append = nullptr; }
6679			break;
6680			case 11:
6681	0	0	{ if (best > 'l') best = 'l', remove = 3, append = "e"; }
6682			break;
6683			case 12:
6684	0	0	{ if (best > 'm') best = 'm', remove = 3, append = nullptr; }
6685			break;
6686			case 13:
6687	0	0	{ if (best > 'n') best = 'n', remove = 3, append = "e"; }
6688			break;
6689			case 14:
6690	0	0	{ if (best > 'o') best = 'o', remove = 3, append = nullptr; }
6691			break;
6692			case 15:
6693	0	0	{ if (best > 'p') best = 'p', remove = 3, append = "e"; }
6694			break;
6695			case 16:
6696	0	0	{ if (best > 'q') best = 'q', remove = 3, append = nullptr; }
6697			break;
6698			case 17:
6699	0	0	{ if (best > 'r') best = 'r', remove = 3, append = "e"; }
6700			break;
6701			}
6702			}
6703			
6704			_again:
			// State 0 ends the scan; otherwise step to the next character.
6705	0	0	if ( cs == 0 )
6706			goto _out;
6707	0	0	if ( ++p != ( (form.c_str() + form.size())) )
6708			goto _resume;
6709			_test_eof: {}
			// Whole form consumed: run the actions registered for the state we ended in.
6710	0	0	if ( p == ( (form.c_str() + form.size())) )
6711			{
6712	0		const char *__acts = _VBG_actions + _VBG_eof_actions[cs];
6713	0		unsigned int __nacts = (unsigned int) *__acts++;
6714	0	0	while ( __nacts-- > 0 ) {
6715	0		switch ( *__acts++ ) {
6716			case 2:
6717	0	0	{ if (best > 'c') best = 'c', remove = 3, append = nullptr; }
6718			break;
6719			case 5:
6720	0	0	{ if (best > 'f') best = 'f', remove = 3, append = "e"; }
6721			break;
6722			case 15:
6723	0	0	{ if (best > 'p') best = 'p', remove = 3, append = "e"; }
6724			break;
6725			}
6726			}
6727			}
6728			
6729			_out: {}
6730			}
6731			
			// Drop the last `remove` characters, add the optional suffix, emit the lemma.
6732	0	0	add(VBG, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas);
		0	
6733	0		}
6734
6735			static const char _VBD_VBN_actions[] = { // Action lists addressed by offset: a count followed by that many action ids, dispatched by the switch statements in add_VBD_VBN.
6736			0, 1, 0, 1, 2, 1, 3, 1,
6737			4, 1, 5, 1, 6, 1, 7, 1,
6738			8, 1, 9, 1, 10, 1, 11, 1,
6739			13, 1, 14, 1, 15, 1, 16, 1,
6740			17, 2, 1, 16, 2, 4, 5, 2,
6741			8, 16, 2, 9, 13, 2, 9, 14,
6742			2, 12, 13, 2, 13, 14, 2, 15,
6743			16, 3, 1, 3, 16, 3, 3, 15,
6744			16
6745			};
6746
6747			static const short _VBD_VBN_key_offsets[] = { // Per state: position of that state's first key in _VBD_VBN_trans_keys.
6748			0, 0, 2, 3, 9, 14, 24, 29,
6749			34, 44, 46, 47, 48, 49, 50, 51,
6750			52, 60, 67, 74, 76, 77, 78, 79,
6751			80, 81, 82, 87, 95, 96, 97, 98,
6752			99, 100, 102, 103, 104, 105, 106, 107,
6753			108, 114, 115, 140, 140, 149, 150, 155,
6754			166, 175, 184, 194, 199, 204, 210, 220,
6755			220, 229, 241, 242, 253, 253, 262, 271,
6756			280, 289, 298, 303, 316, 327, 332, 338,
6757			348, 358, 369, 376, 387, 396, 405, 405,
6758			416, 427, 429, 430, 431, 431, 432, 440,
6759			451, 456, 462, 472, 482, 493, 500, 511,
6760			518, 524, 533, 542, 551
6761			};
6762
6763			static const char _VBD_VBN_trans_keys[] = { // Key character codes per state: single keys first, then low/high range pairs; compared with form characters in add_VBD_VBN.
6764			100, 110, 101, 97, 101, 105, 111, 117,
6765			121, 97, 101, 105, 111, 117, 98, 100,
6766			102, 104, 106, 110, 112, 116, 118, 122,
6767			97, 101, 105, 111, 117, 97, 101, 105,
6768			111, 117, 98, 100, 102, 104, 106, 110,
6769			112, 116, 118, 122, 98, 114, 105, 114,
6770			112, 105, 109, 101, 97, 101, 105, 111,
6771			117, 121, 98, 122, 97, 101, 105, 111,
6772			117, 98, 122, 97, 101, 105, 111, 117,
6773			98, 122, 98, 114, 105, 114, 112, 105,
6774			109, 101, 97, 101, 105, 111, 117, 97,
6775			101, 105, 110, 111, 115, 117, 120, 105,
6776			112, 105, 109, 101, 98, 114, 105, 114,
6777			112, 105, 109, 101, 97, 101, 105, 111,
6778			117, 121, 101, 98, 99, 100, 102, 103,
6779			104, 105, 106, 107, 108, 109, 110, 112,
6780			113, 114, 115, 116, 117, 118, 119, 120,
6781			121, 122, 97, 111, 97, 98, 101, 105,
6782			111, 117, 122, 99, 120, 113, 97, 101,
6783			105, 111, 117, 98, 99, 100, 105, 111,
6784			117, 122, 97, 101, 102, 120, 97, 100,
6785			101, 105, 111, 117, 122, 98, 120, 97,
6786			101, 102, 105, 111, 117, 122, 98, 120,
6787			97, 101, 103, 105, 110, 111, 117, 122,
6788			98, 120, 97, 101, 105, 111, 117, 101,
6789			110, 111, 115, 120, 101, 110, 111, 112,
6790			115, 120, 97, 101, 104, 105, 111, 116,
6791			117, 122, 98, 120, 97, 101, 105, 106,
6792			111, 117, 122, 98, 120, 98, 99, 100,
6793			105, 107, 111, 117, 122, 97, 101, 102,
6794			120, 105, 97, 101, 105, 108, 111, 114,
6795			117, 119, 122, 98, 120, 97, 101, 105,
6796			109, 111, 117, 122, 98, 120, 97, 101,
6797			105, 110, 111, 117, 122, 98, 120, 97,
6798			101, 105, 111, 112, 117, 122, 98, 120,
6799			97, 101, 105, 111, 113, 117, 122, 98,
6800			120, 97, 101, 105, 111, 114, 117, 122,
6801			98, 120, 97, 101, 105, 111, 117, 98,
6802			99, 100, 105, 108, 110, 111, 116, 117,
6803			97, 101, 102, 122, 101, 110, 111, 115,
6804			120, 98, 104, 106, 116, 118, 122, 101,
6805			110, 111, 115, 120, 101, 110, 111, 112,
6806			115, 120, 101, 105, 110, 111, 115, 120,
6807			98, 116, 118, 122, 101, 105, 110, 111,
6808			115, 120, 98, 116, 118, 122, 101, 110,
6809			111, 115, 120, 98, 104, 106, 116, 118,
6810			122, 98, 101, 110, 111, 114, 115, 120,
6811			101, 110, 111, 115, 120, 98, 104, 106,
6812			116, 118, 122, 97, 101, 105, 111, 115,
6813			117, 122, 98, 120, 97, 101, 105, 111,
6814			116, 117, 122, 98, 120, 122, 98, 100,
6815			102, 104, 106, 110, 112, 116, 118, 120,
6816			122, 98, 100, 102, 104, 106, 110, 112,
6817			116, 118, 120, 98, 114, 112, 114, 113,
6818			97, 101, 105, 108, 111, 117, 98, 122,
6819			101, 110, 111, 115, 120, 98, 104, 106,
6820			116, 118, 122, 101, 110, 111, 115, 120,
6821			101, 110, 111, 112, 115, 120, 101, 105,
6822			110, 111, 115, 120, 98, 116, 118, 122,
6823			101, 105, 110, 111, 115, 120, 98, 116,
6824			118, 122, 101, 110, 111, 115, 120, 98,
6825			104, 106, 116, 118, 122, 98, 101, 110,
6826			111, 114, 115, 120, 101, 110, 111, 115,
6827			120, 98, 104, 106, 116, 118, 122, 97,
6828			101, 105, 111, 117, 98, 122, 97, 101,
6829			105, 111, 117, 121, 97, 101, 105, 111,
6830			117, 118, 122, 98, 120, 97, 101, 105,
6831			111, 117, 119, 122, 98, 120, 97, 101,
6832			105, 111, 117, 120, 122, 98, 119, 97,
6833			101, 105, 111, 117, 121, 122, 98, 120,
6834			0
6835			};
6836
6837			static const char _VBD_VBN_single_lengths[] = { // Per state: number of single keys; add_VBD_VBN reads _VBD_VBN_single_lengths[cs] before its single-key search.
6838			0, 2, 1, 6, 5, 0, 5, 5,
6839			0, 2, 1, 1, 1, 1, 1, 1,
6840			6, 5, 5, 2, 1, 1, 1, 1,
6841			1, 1, 5, 8, 1, 1, 1, 1,
6842			1, 2, 1, 1, 1, 1, 1, 1,
6843			6, 1, 23, 0, 7, 1, 5, 7,
6844			7, 7, 8, 5, 5, 6, 8, 0,
6845			7, 8, 1, 9, 0, 7, 7, 7,
6846			7, 7, 5, 9, 5, 5, 6, 6,
6847			6, 5, 7, 5, 7, 7, 0, 1,
6848			1, 2, 1, 1, 0, 1, 6, 5,
6849			5, 6, 6, 6, 5, 7, 5, 5,
6850			6, 7, 7, 7, 7
6851			};
6852
6853			static const char _VBD_VBN_range_lengths[] = { // Per state: number of low/high key ranges; add_VBD_VBN reads _VBD_VBN_range_lengths[cs] before its range search.
6854			0, 0, 0, 0, 0, 5, 0, 0,
6855			5, 0, 0, 0, 0, 0, 0, 0,
6856			1, 1, 1, 0, 0, 0, 0, 0,
6857			0, 0, 0, 0, 0, 0, 0, 0,
6858			0, 0, 0, 0, 0, 0, 0, 0,
6859			0, 0, 1, 0, 1, 0, 0, 2,
6860			1, 1, 1, 0, 0, 0, 1, 0,
6861			1, 2, 0, 1, 0, 1, 1, 1,
6862			1, 1, 0, 2, 3, 0, 0, 2,
6863			2, 3, 0, 3, 1, 1, 0, 5,
6864			5, 0, 0, 0, 0, 0, 1, 3,
6865			0, 0, 2, 2, 3, 0, 3, 1,
6866			0, 1, 1, 1, 1
6867			};
6868
6869			static const short _VBD_VBN_index_offsets[] = { // Per state: first slot of that state in _VBD_VBN_indicies; add_VBD_VBN seeds _trans with _VBD_VBN_index_offsets[cs].
6870			0, 0, 3, 5, 12, 18, 24, 30,
6871			36, 42, 45, 47, 49, 51, 53, 55,
6872			57, 65, 72, 79, 82, 84, 86, 88,
6873			90, 92, 94, 100, 109, 111, 113, 115,
6874			117, 119, 122, 124, 126, 128, 130, 132,
6875			134, 141, 143, 168, 169, 178, 180, 186,
6876			196, 205, 214, 224, 230, 236, 243, 253,
6877			254, 263, 274, 276, 287, 288, 297, 306,
6878			315, 324, 333, 339, 351, 360, 366, 373,
6879			382, 391, 400, 408, 417, 426, 435, 436,
6880			443, 450, 453, 455, 457, 458, 460, 468,
6881			477, 483, 490, 499, 508, 517, 525, 534,
6882			541, 548, 557, 566, 575
6883			};
6884
6885			static const unsigned char _VBD_VBN_indicies[] = { // Slot -> transition id; add_VBD_VBN uses the id to index _VBD_VBN_trans_targs and _VBD_VBN_trans_actions.
6886			0, 2, 1, 3, 1, 4, 4, 4,
6887			4, 4, 4, 1, 5, 5, 5, 5,
6888			6, 1, 7, 7, 7, 7, 7, 1,
6889			8, 8, 8, 8, 9, 1, 5, 5,
6890			5, 5, 10, 1, 11, 11, 11, 11,
6891			11, 1, 11, 12, 1, 11, 1, 13,
6892			1, 11, 1, 14, 1, 11, 1, 11,
6893			1, 4, 4, 4, 4, 4, 16, 15,
6894			1, 5, 5, 5, 5, 6, 17, 1,
6895			5, 5, 5, 5, 6, 18, 1, 19,
6896			20, 1, 19, 1, 21, 1, 19, 1,
6897			22, 1, 19, 1, 19, 1, 23, 24,
6898			23, 25, 26, 1, 27, 28, 27, 29,
6899			30, 31, 27, 32, 1, 33, 1, 33,
6900			1, 34, 1, 33, 1, 33, 1, 35,
6901			36, 1, 35, 1, 37, 1, 35, 1,
6902			38, 1, 35, 1, 35, 1, 39, 39,
6903			39, 39, 39, 4, 1, 40, 1, 42,
6904			43, 44, 45, 46, 47, 48, 49, 50,
6905			51, 52, 53, 54, 55, 56, 57, 58,
6906			59, 60, 61, 62, 63, 64, 41, 1,
6907			1, 65, 66, 65, 65, 65, 65, 4,
6908			4, 1, 67, 1, 68, 68, 68, 68,
6909			68, 1, 70, 71, 70, 69, 69, 69,
6910			70, 69, 70, 1, 72, 66, 72, 72,
6911			72, 72, 4, 4, 1, 65, 65, 66,
6912			65, 65, 65, 4, 4, 1, 69, 69,
6913			71, 69, 73, 69, 69, 70, 70, 1,
6914			74, 74, 74, 74, 74, 1, 75, 76,
6915			77, 78, 79, 1, 75, 76, 77, 11,
6916			78, 79, 1, 65, 65, 66, 65, 65,
6917			80, 65, 4, 4, 1, 81, 65, 65,
6918			65, 66, 65, 65, 4, 4, 1, 4,
6919			82, 4, 65, 66, 65, 65, 4, 65,
6920			4, 1, 7, 1, 65, 65, 65, 71,
6921			65, 83, 65, 83, 70, 70, 1, 5,
6922			65, 65, 65, 66, 65, 65, 4, 4,
6923			1, 84, 84, 85, 66, 84, 84, 4,
6924			4, 1, 84, 84, 84, 84, 66, 84,
6925			4, 4, 1, 65, 65, 65, 65, 66,
6926			65, 4, 4, 1, 65, 86, 65, 87,
6927			66, 65, 4, 4, 1, 5, 5, 5,
6928			5, 6, 1, 88, 89, 88, 5, 89,
6929			89, 5, 89, 6, 5, 88, 1, 90,
6930			91, 92, 93, 94, 88, 88, 88, 1,
6931			90, 95, 92, 96, 97, 1, 90, 95,
6932			92, 19, 96, 97, 1, 90, 19, 91,
6933			92, 93, 94, 88, 88, 1, 90, 22,
6934			91, 92, 93, 94, 88, 88, 1, 98,
6935			91, 92, 93, 94, 88, 88, 88, 1,
6936			19, 90, 95, 92, 20, 96, 97, 1,
6937			90, 100, 92, 101, 102, 99, 99, 99,
6938			1, 69, 69, 69, 69, 103, 69, 70,
6939			70, 1, 104, 105, 106, 65, 66, 65,
6940			4, 4, 1, 107, 109, 109, 109, 109,
6941			109, 109, 108, 110, 110, 110, 110, 110,
6942			110, 1, 33, 111, 1, 33, 1, 112,
6943			1, 108, 113, 107, 5, 5, 5, 115,
6944			5, 6, 114, 1, 116, 117, 118, 119,
6945			120, 114, 114, 114, 1, 116, 121, 118,
6946			122, 123, 1, 116, 121, 118, 35, 122,
6947			123, 1, 116, 35, 117, 118, 119, 120,
6948			114, 114, 1, 116, 38, 117, 118, 119,
6949			120, 114, 114, 1, 124, 117, 118, 119,
6950			120, 114, 114, 114, 1, 35, 116, 121,
6951			118, 36, 122, 123, 1, 116, 126, 118,
6952			127, 128, 125, 125, 125, 1, 5, 5,
6953			5, 5, 6, 114, 1, 4, 4, 4,
6954			4, 4, 4, 1, 69, 69, 69, 69,
6955			69, 71, 70, 70, 1, 84, 84, 84,
6956			84, 84, 66, 4, 4, 1, 84, 84,
6957			84, 84, 84, 66, 4, 4, 1, 129,
6958			129, 129, 129, 129, 131, 132, 130, 1,
6959			0
6960			};
6961
6962			static const char _VBD_VBN_trans_targs[] = { // Transition id -> next state (cs); a target of 0 makes add_VBD_VBN stop scanning.
6963			2, 0, 41, 42, 43, 43, 45, 43,
6964			43, 45, 45, 52, 53, 12, 14, 43,
6965			43, 43, 43, 69, 70, 22, 24, 78,
6966			79, 84, 85, 43, 81, 28, 83, 30,
6967			32, 43, 31, 88, 89, 36, 38, 66,
6968			43, 3, 44, 47, 48, 49, 50, 54,
6969			16, 56, 57, 59, 61, 62, 63, 64,
6970			65, 76, 77, 96, 97, 98, 99, 40,
6971			100, 4, 46, 43, 5, 6, 43, 46,
6972			7, 51, 8, 9, 10, 11, 13, 15,
6973			55, 43, 58, 60, 17, 18, 66, 67,
6974			68, 75, 19, 71, 21, 72, 73, 20,
6975			23, 25, 74, 68, 71, 72, 73, 46,
6976			26, 86, 95, 43, 43, 80, 27, 82,
6977			29, 43, 87, 94, 33, 90, 35, 91,
6978			92, 34, 37, 39, 93, 87, 90, 91,
6979			92, 66, 43, 43, 46
6980			};
6981
6982			static const char _VBD_VBN_trans_actions[] = { // Transition id -> offset into _VBD_VBN_actions; add_VBD_VBN skips the action step when the entry is 0.
6983			0, 0, 0, 31, 29, 25, 25, 5,
6984			51, 51, 45, 0, 0, 0, 0, 15,
6985			39, 9, 36, 0, 0, 0, 0, 25,
6986			25, 25, 25, 21, 21, 0, 21, 0,
6987			0, 19, 0, 0, 0, 0, 0, 29,
6988			1, 0, 0, 0, 0, 0, 0, 0,
6989			0, 0, 0, 0, 0, 0, 0, 0,
6990			0, 0, 0, 27, 0, 0, 0, 0,
6991			0, 0, 29, 17, 0, 0, 54, 54,
6992			0, 54, 0, 0, 0, 0, 0, 0,
6993			29, 27, 29, 54, 0, 0, 13, 13,
6994			0, 0, 0, 0, 0, 0, 0, 0,
6995			0, 0, 0, 7, 7, 7, 7, 61,
6996			0, 19, 19, 23, 48, 48, 0, 19,
6997			0, 42, 0, 0, 0, 0, 0, 0,
6998			0, 0, 0, 0, 0, 17, 17, 17,
6999			17, 3, 33, 3, 57
7000			};
7001
7002			static const char _VBD_VBN_eof_actions[] = { // Per state: offset into _VBD_VBN_actions of the actions add_VBD_VBN runs once the whole form has been consumed.
7003			0, 0, 0, 0, 0, 0, 0, 0,
7004			0, 0, 0, 0, 0, 0, 0, 0,
7005			0, 0, 0, 0, 0, 0, 0, 0,
7006			0, 0, 0, 0, 0, 0, 0, 0,
7007			0, 0, 0, 0, 0, 0, 0, 0,
7008			0, 0, 0, 0, 5, 0, 0, 5,
7009			5, 5, 5, 0, 5, 5, 5, 0,
7010			5, 5, 0, 5, 0, 5, 5, 5,
7011			5, 5, 0, 0, 11, 11, 11, 11,
7012			11, 11, 11, 11, 5, 5, 0, 0,
7013			0, 0, 0, 0, 0, 0, 0, 17,
7014			17, 17, 17, 17, 17, 17, 17, 0,
7015			0, 5, 5, 5, 5
7016			};
7017
7018			static const int VBD_VBN_start = 1; // State in which add_VBD_VBN starts (assigned to cs before scanning).
7019
7020	0		void english_morpho_guesser::add_VBD_VBN(const string& form, vector& lemmas) const {
			// Builds one lemma candidate for `form` and hands it to add(VBD, VBN, ..., lemmas).
			//
			// A table-driven state machine (the layout looks Ragel-generated -- TODO confirm)
			// consumes the characters of `form` starting with the LAST one and moving
			// towards the first. Actions fired on the way select a rewrite rule: every
			// rule carries a letter and is accepted only when that letter is lower than
			// the current `best`, so the lowest letter seen wins. Rules here strip 1 or 2
			// trailing characters; rule 'i' strips 3 and appends "y".
			// When no rule fires, remove stays 0 and the unchanged form is emitted.
7021			const char* p = form.c_str(); int cs;
7022			char best = 'z'; unsigned remove = 0; const char* append = nullptr;
7023			
7024			{
7025			cs = VBD_VBN_start;
7026			}
7027			
7028			{
7029			int _klen;
7030			unsigned int _trans;
7031			const char *_acts;
7032			unsigned int _nacts;
7033			const char *_keys;
7034			
			// Empty form: nothing to scan, go straight to the end-of-input handling.
7035	0	0	if ( p == ( (form.c_str() + form.size())) )
7036			goto _test_eof;
7037			if ( cs == 0 )
7038			goto _out;
7039			_resume:
			// Locate the keys and the first transition slot of the current state.
7040	0		_keys = _VBD_VBN_trans_keys + _VBD_VBN_key_offsets[cs];
7041	0		_trans = _VBD_VBN_index_offsets[cs];
7042			
			// Binary search among the single keys of the state. The subscript
			// form.size() - 1 - (p - form.c_str()) mirrors p's offset, which is what
			// makes the machine read the form back to front.
7043	0		_klen = _VBD_VBN_single_lengths[cs];
7044	0	0	if ( _klen > 0 ) {
7045			const char *_lower = _keys;
7046			const char *_mid;
7047	0		const char *_upper = _keys + _klen - 1;
7048			while (1) {
7049	0	0	if ( _upper < _lower )
7050			break;
7051			
7052	0		_mid = _lower + ((_upper-_lower) >> 1);
7053	0	0	if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid )
7054	0		_upper = _mid - 1;
7055	0	0	else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid )
7056	0		_lower = _mid + 1;
7057			else {
7058	0		_trans += (unsigned int)(_mid - _keys);
7059	0		goto _match;
7060			}
7061			}
			// No single key matched: skip past them in both the key and the slot index.
7062	0		_keys += _klen;
7063	0		_trans += _klen;
7064			}
7065			
			// Binary search among the low/high range pairs (two key bytes per range).
7066	0		_klen = _VBD_VBN_range_lengths[cs];
7067	0	0	if ( _klen > 0 ) {
7068			const char *_lower = _keys;
7069			const char *_mid;
7070	0		const char *_upper = _keys + (_klen<<1) - 2;
7071			while (1) {
7072	0	0	if ( _upper < _lower )
7073			break;
7074			
7075	0		_mid = _lower + (((_upper-_lower) >> 1) & ~1);
7076	0	0	if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] )
7077	0		_upper = _mid - 2;
7078	0	0	else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] )
7079	0		_lower = _mid + 2;
7080			else {
7081	0		_trans += (unsigned int)((_mid - _keys)>>1);
7082	0		goto _match;
7083			}
7084			}
			// No range matched either: _trans ends on the slot after all keys of the state.
7085	0		_trans += _klen;
7086			}
7087			
7088			_match:
			// Turn the slot into a transition id and move to its target state.
7089	0		_trans = _VBD_VBN_indicies[_trans];
7090	0		cs = _VBD_VBN_trans_targs[_trans];
7091			
7092	0	0	if ( _VBD_VBN_trans_actions[_trans] == 0 )
7093			goto _again;
7094			
			// Action list layout: a count followed by that many action ids.
7095	0		_acts = _VBD_VBN_actions + _VBD_VBN_trans_actions[_trans];
7096	0		_nacts = (unsigned int) *_acts++;
7097	0	0	while ( _nacts-- > 0 )
7098			{
7099	0		switch ( *_acts++ )
7100			{
7101			case 0:
7102	0	0	{ if (best > 'a') best = 'a', remove = 1, append = nullptr; }
7103			break;
7104			case 1:
7105	0	0	{ if (best > 'b') best = 'b', remove = 2, append = nullptr; }
7106			break;
7107			case 2:
7108	0	0	{ if (best > 'c') best = 'c', remove = 1, append = nullptr; }
7109			break;
7110			case 3:
7111	0	0	{ if (best > 'd') best = 'd', remove = 2, append = nullptr; }
7112			break;
7113			case 4:
7114	0	0	{ if (best > 'e') best = 'e', remove = 1, append = nullptr; }
7115			break;
7116			case 5:
7117	0	0	{ if (best > 'f') best = 'f', remove = 2, append = nullptr; }
7118			break;
7119			case 7:
7120	0	0	{ if (best > 'h') best = 'h', remove = 2, append = nullptr; }
7121			break;
7122			case 8:
7123	0	0	{ if (best > 'i') best = 'i', remove = 3, append = "y"; }
7124			break;
7125			case 9:
7126	0	0	{ if (best > 'j') best = 'j', remove = 1, append = nullptr; }
7127			break;
7128			case 10:
7129	0	0	{ if (best > 'k') best = 'k', remove = 2, append = nullptr; }
7130			break;
7131			case 11:
7132	0	0	{ if (best > 'l') best = 'l', remove = 1, append = nullptr; }
7133			break;
7134			case 12:
7135	0	0	{ if (best > 'm') best = 'm', remove = 2, append = nullptr; }
7136			break;
7137			case 13:
7138	0	0	{ if (best > 'n') best = 'n', remove = 1, append = nullptr; }
7139			break;
7140			case 14:
7141	0	0	{ if (best > 'o') best = 'o', remove = 2, append = nullptr; }
7142			break;
7143			case 15:
7144	0	0	{ if (best > 'p') best = 'p', remove = 1, append = nullptr; }
7145			break;
7146			case 16:
7147	0	0	{ if (best > 'q') best = 'q', remove = 2, append = nullptr; }
7148			break;
7149			case 17:
7150	0	0	{ if (best > 'r') best = 'r', remove = 1, append = nullptr; }
7151			break;
7152			}
7153			}
7154			
7155			_again:
			// State 0 ends the scan; otherwise step to the next character.
7156	0	0	if ( cs == 0 )
7157			goto _out;
7158	0	0	if ( ++p != ( (form.c_str() + form.size())) )
7159			goto _resume;
7160			_test_eof: {}
			// Whole form consumed: run the actions registered for the state we ended in.
7161	0	0	if ( p == ( (form.c_str() + form.size())) )
7162			{
7163	0		const char *__acts = _VBD_VBN_actions + _VBD_VBN_eof_actions[cs];
7164	0		unsigned int __nacts = (unsigned int) *__acts++;
7165	0	0	while ( __nacts-- > 0 ) {
7166	0		switch ( *__acts++ ) {
7167			case 3:
7168	0	0	{ if (best > 'd') best = 'd', remove = 2, append = nullptr; }
7169			break;
7170			case 6:
7171	0	0	{ if (best > 'g') best = 'g', remove = 1, append = nullptr; }
7172			break;
7173			case 9:
7174	0	0	{ if (best > 'j') best = 'j', remove = 1, append = nullptr; }
7175			break;
7176			}
7177			}
7178			}
7179			
7180			_out: {}
7181			}
7182			
			// Drop the last `remove` characters, add the optional suffix, emit the lemma.
7183	0	0	add(VBD, VBN, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas);
		0	
7184	0		}
7185
7186			static const char _VBZ_actions[] = { // Action lists addressed by offset: a count followed by that many action ids, dispatched by the switch in add_VBZ.
7187			0, 1, 0, 1, 1, 1, 2, 1,
7188			3, 1, 4, 1, 5, 1, 6, 1,
7189			7, 1, 8
7190			};
7191
7192			static const char _VBZ_key_offsets[] = { // Per state: position of that state's first key in _VBZ_trans_keys.
7193			0, 0, 1, 2, 4, 14, 14, 25,
7194			26, 31, 31, 31, 31, 37, 45, 54
7195			};
7196
7197			static const char _VBZ_trans_keys[] = { // Key character codes per state: single keys first, then low/high range pairs; compared with form characters in add_VBZ.
7198			115, 101, 99, 115, 98, 100, 102, 104,
7199			106, 110, 112, 116, 118, 122, 122, 98,
7200			100, 102, 104, 106, 110, 112, 116, 118,
7201			120, 111, 97, 101, 105, 111, 117, 104,
7202			105, 111, 115, 120, 122, 97, 101, 105,
7203			110, 111, 114, 115, 117, 97, 101, 105,
7204			111, 117, 121, 122, 98, 120, 0
7205			};
7206
7207			static const char _VBZ_single_lengths[] = { // Per state: number of single keys; add_VBZ reads _VBZ_single_lengths[cs] before its single-key search.
7208			0, 1, 1, 2, 0, 0, 1, 1,
7209			5, 0, 0, 0, 6, 8, 7, 0
7210			};
7211
7212			static const char _VBZ_range_lengths[] = { // Per state: number of low/high key ranges; add_VBZ reads _VBZ_range_lengths[cs] before its range search.
7213			0, 0, 0, 0, 5, 0, 5, 0,
7214			0, 0, 0, 0, 0, 0, 1, 0
7215			};
7216
7217			static const char _VBZ_index_offsets[] = { // Per state: first slot of that state in _VBZ_indicies; add_VBZ seeds _trans with _VBZ_index_offsets[cs].
7218			0, 0, 2, 4, 7, 13, 14, 21,
7219			23, 29, 30, 31, 32, 39, 48, 57
7220			};
7221
7222			static const char _VBZ_indicies[] = { // Slot -> transition id; add_VBZ uses the id to index _VBZ_trans_targs and _VBZ_trans_actions.
7223			0, 1, 3, 2, 4, 4, 1, 5,
7224			5, 5, 5, 5, 1, 6, 7, 7,
7225			7, 7, 7, 7, 1, 8, 1, 9,
7226			9, 9, 9, 9, 1, 8, 10, 1,
7227			11, 12, 13, 14, 4, 15, 1, 16,
7228			16, 16, 17, 16, 18, 19, 16, 1,
7229			20, 20, 20, 20, 20, 20, 22, 21,
7230			1, 10, 0
7231			};
7232
7233			static const char _VBZ_trans_targs[] = { // Transition id -> next state (cs); a target of 0 makes add_VBZ stop scanning.
7234			2, 0, 11, 12, 11, 5, 11, 11,
7235			11, 9, 11, 3, 4, 6, 13, 14,
7236			11, 7, 8, 11, 11, 10, 15
7237			};
7238
7239			static const char _VBZ_trans_actions[] = { // Transition id -> offset into _VBZ_actions; add_VBZ skips the action step when the entry is 0.
7240			0, 0, 17, 17, 11, 0, 13, 15,
7241			9, 0, 3, 0, 0, 0, 11, 11,
7242			1, 0, 0, 7, 5, 0, 7
7243			};
7244
7245			static const int VBZ_start = 1; // State in which add_VBZ starts (assigned to cs before scanning).
7246
7247	0		void english_morpho_guesser::add_VBZ(const string& form, vector& lemmas) const {
			// Builds one lemma candidate for `form` and hands it to add(VBZ, ..., lemmas).
			//
			// A table-driven state machine (the layout looks Ragel-generated -- TODO confirm)
			// consumes the characters of `form` starting with the LAST one and moving
			// towards the first. Actions fired on the way select a rewrite rule: every
			// rule carries a letter and is accepted only when that letter is lower than
			// the current `best`, so the lowest letter seen wins. Rules here strip 1 or 2
			// trailing characters; rule 'g' strips 3 and appends "y".
			// When no rule fires, remove stays 0 and the unchanged form is emitted.
7248			const char* p = form.c_str(); int cs;
7249			char best = 'z'; unsigned remove = 0; const char* append = nullptr;
7250			
7251			{
7252			cs = VBZ_start;
7253			}
7254			
7255			{
7256			int _klen;
7257			unsigned int _trans;
7258			const char *_acts;
7259			unsigned int _nacts;
7260			const char *_keys;
7261			
			// Empty form: nothing to scan.
7262	0	0	if ( p == ( (form.c_str() + form.size())) )
7263			goto _test_eof;
7264			if ( cs == 0 )
7265			goto _out;
7266			_resume:
			// Locate the keys and the first transition slot of the current state.
7267	0		_keys = _VBZ_trans_keys + _VBZ_key_offsets[cs];
7268	0		_trans = _VBZ_index_offsets[cs];
7269			
			// Binary search among the single keys of the state. The subscript
			// form.size() - 1 - (p - form.c_str()) mirrors p's offset, which is what
			// makes the machine read the form back to front.
7270	0		_klen = _VBZ_single_lengths[cs];
7271	0	0	if ( _klen > 0 ) {
7272			const char *_lower = _keys;
7273			const char *_mid;
7274	0		const char *_upper = _keys + _klen - 1;
7275			while (1) {
7276	0	0	if ( _upper < _lower )
7277			break;
7278			
7279	0		_mid = _lower + ((_upper-_lower) >> 1);
7280	0	0	if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid )
7281	0		_upper = _mid - 1;
7282	0	0	else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid )
7283	0		_lower = _mid + 1;
7284			else {
7285	0		_trans += (unsigned int)(_mid - _keys);
7286	0		goto _match;
7287			}
7288			}
			// No single key matched: skip past them in both the key and the slot index.
7289	0		_keys += _klen;
7290	0		_trans += _klen;
7291			}
7292			
			// Binary search among the low/high range pairs (two key bytes per range).
7293	0		_klen = _VBZ_range_lengths[cs];
7294	0	0	if ( _klen > 0 ) {
7295			const char *_lower = _keys;
7296			const char *_mid;
7297	0		const char *_upper = _keys + (_klen<<1) - 2;
7298			while (1) {
7299	0	0	if ( _upper < _lower )
7300			break;
7301			
7302	0		_mid = _lower + (((_upper-_lower) >> 1) & ~1);
7303	0	0	if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] )
7304	0		_upper = _mid - 2;
7305	0	0	else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] )
7306	0		_lower = _mid + 2;
7307			else {
7308	0		_trans += (unsigned int)((_mid - _keys)>>1);
7309	0		goto _match;
7310			}
7311			}
			// No range matched either: _trans ends on the slot after all keys of the state.
7312	0		_trans += _klen;
7313			}
7314			
7315			_match:
			// Turn the slot into a transition id and move to its target state.
7316	0		_trans = _VBZ_indicies[_trans];
7317	0		cs = _VBZ_trans_targs[_trans];
7318			
7319	0	0	if ( _VBZ_trans_actions[_trans] == 0 )
7320			goto _again;
7321			
			// Action list layout: a count followed by that many action ids.
7322	0		_acts = _VBZ_actions + _VBZ_trans_actions[_trans];
7323	0		_nacts = (unsigned int) *_acts++;
7324	0	0	while ( _nacts-- > 0 )
7325			{
7326	0		switch ( *_acts++ )
7327			{
7328			case 0:
7329	0	0	{ if (best > 'a') best = 'a', remove = 1, append = nullptr; }
7330			break;
7331			case 1:
7332	0	0	{ if (best > 'b') best = 'b', remove = 2, append = nullptr; }
7333			break;
7334			case 2:
7335	0	0	{ if (best > 'c') best = 'c', remove = 1, append = nullptr; }
7336			break;
7337			case 3:
7338	0	0	{ if (best > 'd') best = 'd', remove = 2, append = nullptr; }
7339			break;
7340			case 4:
7341	0	0	{ if (best > 'e') best = 'e', remove = 1, append = nullptr; }
7342			break;
7343			case 5:
7344	0	0	{ if (best > 'f') best = 'f', remove = 2, append = nullptr; }
7345			break;
7346			case 6:
7347	0	0	{ if (best > 'g') best = 'g', remove = 3, append = "y"; }
7348			break;
7349			case 7:
7350	0	0	{ if (best > 'h') best = 'h', remove = 2, append = nullptr; }
7351			break;
7352			case 8:
7353	0	0	{ if (best > 'i') best = 'i', remove = 1, append = nullptr; }
7354			break;
7355			}
7356			}
7357			
7358			_again:
			// State 0 ends the scan; otherwise step to the next character.
7359	0	0	if ( cs == 0 )
7360			goto _out;
7361	0	0	if ( ++p != ( (form.c_str() + form.size())) )
7362			goto _resume;
			// Both labels are empty here: this function runs no end-of-input actions.
7363			_test_eof: {}
7364			_out: {}
7365			}
7366			
			// Drop the last `remove` characters, add the optional suffix, emit the lemma.
7367	0	0	add(VBZ, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas);
		0	
7368	0		}
7369
7370			static const char _JJR_RBR_actions[] = { // NOTE(review): presumably count-prefixed action-id lists addressed by offset, like the sibling *_actions tables -- confirm in add_JJR_RBR.
7371			0, 1, 0, 1, 1, 1, 3, 1,
7372			4, 1, 5, 2, 1, 4, 2, 2,
7373			5, 2, 4, 5
7374			};
7375
7376			static const unsigned char _JJR_RBR_key_offsets[] = { // Per state: position of that state's first key in _JJR_RBR_trans_keys (read as _JJR_RBR_key_offsets[cs] in add_JJR_RBR).
7377			0, 0, 1, 2, 26, 26, 32, 37,
7378			50, 56, 62, 73, 79, 85, 91, 102,
7379			103, 109, 115, 117, 123, 129, 135, 146,
7380			152, 163, 169, 175, 181
7381			};
7382
7383			static const char _JJR_RBR_trans_keys[] = { // Key character codes; add_JJR_RBR locates a state's keys at _JJR_RBR_trans_keys + _JJR_RBR_key_offsets[cs].
7384			114, 101, 98, 99, 100, 101, 102, 103,
7385			104, 105, 106, 107, 108, 109, 110, 112,
7386			113, 114, 115, 116, 117, 118, 119, 120,
7387			121, 122, 97, 98, 101, 105, 111, 117,
7388			97, 101, 105, 111, 117, 98, 99, 100,
7389			105, 111, 117, 122, 97, 101, 102, 109,
7390			112, 120, 97, 100, 101, 105, 111, 117,
7391			97, 101, 102, 105, 111, 117, 97, 101,
7392			103, 105, 111, 117, 122, 98, 109, 112,
7393			120, 97, 101, 104, 105, 111, 117, 97,
7394			101, 105, 106, 111, 117, 97, 101, 105,
7395			107, 111, 117, 97, 101, 105, 108, 111,
7396			117, 122, 98, 109, 112, 120, 101, 97,
7397			101, 105, 109, 111, 117, 97, 101, 105,
7398			110, 111, 117, 97, 122, 97, 101, 105,
7399			111, 112, 117, 97, 101, 105, 111, 113,
7400			117, 97, 101, 105, 111, 114, 117, 97,
7401			101, 105, 111, 115, 117, 122, 98, 109,
7402			112, 120, 97, 101, 105, 111, 116, 117,
7403			97, 101, 105, 111, 117, 118, 122, 98,
7404			109, 112, 120, 97, 101, 105, 111, 117,
7405			119, 97, 101, 105, 111, 117, 120, 97,
7406			101, 105, 111, 117, 121, 97, 101, 105,
7407			111, 117, 122, 0
7408			};
7409
7410			static const char _JJR_RBR_single_lengths[] = { // Per state: number of single keys (read as _JJR_RBR_single_lengths[cs] in add_JJR_RBR).
7411			0, 1, 1, 24, 0, 6, 5, 7,
7412			6, 6, 7, 6, 6, 6, 7, 1,
7413			6, 6, 0, 6, 6, 6, 7, 6,
7414			7, 6, 6, 6, 6
7415			};
7416
7417			static const char _JJR_RBR_range_lengths[] = { // NOTE(review): presumably the per-state number of low/high key ranges, like the sibling *_range_lengths tables -- confirm in add_JJR_RBR.
7418			0, 0, 0, 0, 0, 0, 0, 3,
7419			0, 0, 2, 0, 0, 0, 2, 0,
7420			0, 0, 1, 0, 0, 0, 2, 0,
7421			2, 0, 0, 0, 0
7422			};
7423
7424			static const unsigned char _JJR_RBR_index_offsets[] = { // Per state: initial value of _trans in add_JJR_RBR (read as _JJR_RBR_index_offsets[cs]).
7425			0, 0, 2, 4, 29, 30, 37, 43,
7426			54, 61, 68, 78, 85, 92, 99, 109,
7427			111, 118, 125, 127, 134, 141, 148, 158,
7428			165, 175, 182, 189, 196
7429			};
7430
7431			static const char _JJR_RBR_indicies[] = { // NOTE(review): presumably maps a transition slot to a transition id, like the sibling *_indicies tables -- confirm in add_JJR_RBR.
7432			0, 1, 2, 1, 4, 5, 6, 7,
7433			8, 9, 10, 11, 12, 13, 14, 15,
7434			16, 17, 18, 19, 20, 21, 7, 22,
7435			23, 24, 25, 26, 3, 1, 27, 28,
7436			27, 27, 27, 27, 1, 29, 29, 29,
7437			29, 29, 1, 30, 31, 30, 27, 27,
7438			27, 30, 27, 30, 30, 1, 27, 28,
7439			27, 27, 27, 27, 1, 27, 27, 28,
7440			27, 27, 27, 1, 27, 27, 31, 27,
7441			27, 27, 30, 30, 30, 1, 27, 27,
7442			28, 27, 27, 27, 1, 27, 27, 27,
7443			28, 27, 27, 1, 27, 27, 27, 28,
7444			27, 27, 1, 27, 27, 27, 32, 27,
7445			27, 30, 30, 30, 1, 1, 33, 27,
7446			27, 27, 28, 27, 27, 1, 34, 34,
7447			34, 28, 34, 34, 1, 29, 1, 34,
7448			34, 34, 34, 28, 34, 1, 27, 27,
7449			27, 27, 28, 27, 1, 27, 27, 27,
7450			27, 28, 27, 1, 27, 27, 27, 27,
7451			31, 27, 30, 30, 30, 1, 27, 27,
7452			27, 27, 28, 27, 1, 27, 27, 27,
7453			27, 27, 31, 30, 30, 30, 1, 34,
7454			34, 34, 34, 34, 28, 1, 34, 34,
7455			34, 34, 34, 28, 1, 27, 27, 27,
7456			27, 27, 28, 1, 27, 27, 27, 27,
7457			27, 28, 1, 0
7458			};
7459
7460			static const char _JJR_RBR_trans_targs[] = {
7461			2, 0, 3, 4, 5, 7, 8, 4,
7462			9, 10, 11, 4, 12, 13, 14, 16,
7463			17, 19, 20, 21, 22, 23, 24, 25,
7464			26, 27, 28, 6, 4, 4, 4, 4,
7465			15, 4, 18
7466			};
7467
7468			static const char _JJR_RBR_trans_actions[] = {
7469			0, 0, 0, 9, 9, 9, 9, 17,
7470			9, 9, 9, 14, 9, 9, 9, 9,
7471			9, 9, 9, 9, 9, 9, 9, 9,
7472			9, 9, 9, 7, 3, 5, 7, 11,
7473			11, 1, 7
7474			};
7475
7476			static const int JJR_RBR_start = 1;
7477
7478	0		void english_morpho_guesser::add_JJR_RBR(const string& form, unsigned negation_len, vector& lemmas) const {
7479	0		const char* p = form.c_str() + negation_len; int cs;
7480			char best = 'z'; unsigned remove = 0; const char* append = nullptr;
7481
7482			{
7483			cs = JJR_RBR_start;
7484			}
7485
7486			{
7487			int _klen;
7488			unsigned int _trans;
7489			const char *_acts;
7490			unsigned int _nacts;
7491			const char *_keys;
7492
7493	0	0	if ( p == ( (form.c_str() + form.size())) )
7494			goto _test_eof;
7495			if ( cs == 0 )
7496			goto _out;
7497			_resume:
7498	0		_keys = _JJR_RBR_trans_keys + _JJR_RBR_key_offsets[cs];
7499	0		_trans = _JJR_RBR_index_offsets[cs];
7500
7501	0		_klen = _JJR_RBR_single_lengths[cs];
7502	0	0	if ( _klen > 0 ) {
7503			const char *_lower = _keys;
7504			const char *_mid;
7505	0		const char *_upper = _keys + _klen - 1;
7506			while (1) {
7507	0	0	if ( _upper < _lower )
7508			break;
7509
7510	0		_mid = _lower + ((_upper-_lower) >> 1);
7511	0	0	if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < *_mid )
7512	0		_upper = _mid - 1;
7513	0	0	else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > *_mid )
7514	0		_lower = _mid + 1;
7515			else {
7516	0		_trans += (unsigned int)(_mid - _keys);
7517	0		goto _match;
7518			}
7519			}
7520	0		_keys += _klen;
7521	0		_trans += _klen;
7522			}
7523
7524	0		_klen = _JJR_RBR_range_lengths[cs];
7525	0	0	if ( _klen > 0 ) {
7526			const char *_lower = _keys;
7527			const char *_mid;
7528	0		const char *_upper = _keys + (_klen<<1) - 2;
7529			while (1) {
7530	0	0	if ( _upper < _lower )
7531			break;
7532
7533	0		_mid = _lower + (((_upper-_lower) >> 1) & ~1);
7534	0	0	if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < _mid[0] )
7535	0		_upper = _mid - 2;
7536	0	0	else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > _mid[1] )
7537	0		_lower = _mid + 2;
7538			else {
7539	0		_trans += (unsigned int)((_mid - _keys)>>1);
7540	0		goto _match;
7541			}
7542			}
7543	0		_trans += _klen;
7544			}
7545
7546			_match:
7547	0		_trans = _JJR_RBR_indicies[_trans];
7548	0		cs = _JJR_RBR_trans_targs[_trans];
7549
7550	0	0	if ( _JJR_RBR_trans_actions[_trans] == 0 )
7551			goto _again;
7552
7553	0		_acts = _JJR_RBR_actions + _JJR_RBR_trans_actions[_trans];
7554	0		_nacts = (unsigned int) *_acts++;
7555	0	0	while ( _nacts-- > 0 )
7556			{
7557	0		switch ( *_acts++ )
7558			{
7559			case 0:
7560	0	0	{ if (best > 'a') best = 'a', remove = 2, append = nullptr; }
7561			break;
7562			case 1:
7563	0	0	{ if (best > 'b') best = 'b', remove = 3, append = nullptr; }
7564			break;
7565			case 2:
7566	0	0	{ if (best > 'c') best = 'c', remove = 3, append = "y"; }
7567			break;
7568			case 3:
7569	0	0	{ if (best > 'd') best = 'd', remove = 2, append = nullptr; }
7570			break;
7571			case 4:
7572	0	0	{ if (best > 'e') best = 'e', remove = 1, append = nullptr; }
7573			break;
7574			case 5:
7575	0	0	{ if (best > 'f') best = 'f', remove = 2, append = nullptr; }
7576			break;
7577			}
7578			}
7579
7580			_again:
7581	0	0	if ( cs == 0 )
7582			goto _out;
7583	0	0	if ( ++p != ( (form.c_str() + form.size())) )
7584			goto _resume;
7585			_test_eof: {}
7586			_out: {}
7587			}
7588
7589	0	0	add(JJR, RBR, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas);
		0
		0
7590	0		}
7591
7592			static const char _JJS_RBS_actions[] = {
7593			0, 1, 1, 1, 2, 1, 4, 1,
7594			5, 2, 0, 5, 2, 1, 4, 2,
7595			3, 5
7596			};
7597
7598			static const unsigned char _JJS_RBS_key_offsets[] = {
7599			0, 0, 1, 2, 3, 25, 25, 25,
7600			31, 44, 50, 56, 67, 73, 79, 85,
7601			96, 102, 108, 114, 120, 126, 137, 143,
7602			154, 160, 166, 172, 178, 178, 183, 183,
7603			183, 184
7604			};
7605
7606			static const char _JJS_RBS_trans_keys[] = {
7607			116, 115, 101, 98, 99, 100, 102, 103,
7608			104, 105, 106, 107, 108, 109, 110, 112,
7609			113, 114, 115, 116, 118, 119, 120, 121,
7610			122, 97, 98, 101, 105, 111, 117, 98,
7611			99, 100, 105, 111, 117, 122, 97, 101,
7612			102, 109, 112, 120, 97, 100, 101, 105,
7613			111, 117, 97, 101, 102, 105, 111, 117,
7614			97, 101, 103, 105, 111, 117, 122, 98,
7615			109, 112, 120, 97, 101, 104, 105, 111,
7616			117, 97, 101, 105, 106, 111, 117, 97,
7617			101, 105, 107, 111, 117, 97, 101, 105,
7618			108, 111, 117, 122, 98, 109, 112, 120,
7619			97, 101, 105, 109, 111, 117, 97, 101,
7620			105, 110, 111, 117, 97, 101, 105, 111,
7621			112, 117, 97, 101, 105, 111, 113, 117,
7622			97, 101, 105, 111, 114, 117, 97, 101,
7623			105, 111, 115, 117, 122, 98, 109, 112,
7624			120, 97, 101, 105, 111, 116, 117, 97,
7625			101, 105, 111, 117, 118, 122, 98, 109,
7626			112, 120, 97, 101, 105, 111, 117, 119,
7627			97, 101, 105, 111, 117, 120, 97, 101,
7628			105, 111, 117, 121, 97, 101, 105, 111,
7629			117, 122, 97, 101, 105, 111, 117, 101,
7630			97, 122, 0
7631			};
7632
7633			static const char _JJS_RBS_single_lengths[] = {
7634			0, 1, 1, 1, 22, 0, 0, 6,
7635			7, 6, 6, 7, 6, 6, 6, 7,
7636			6, 6, 6, 6, 6, 7, 6, 7,
7637			6, 6, 6, 6, 0, 5, 0, 0,
7638			1, 0
7639			};
7640
7641			static const char _JJS_RBS_range_lengths[] = {
7642			0, 0, 0, 0, 0, 0, 0, 0,
7643			3, 0, 0, 2, 0, 0, 0, 2,
7644			0, 0, 0, 0, 0, 2, 0, 2,
7645			0, 0, 0, 0, 0, 0, 0, 0,
7646			0, 1
7647			};
7648
7649			static const unsigned char _JJS_RBS_index_offsets[] = {
7650			0, 0, 2, 4, 6, 29, 30, 31,
7651			38, 49, 56, 63, 73, 80, 87, 94,
7652			104, 111, 118, 125, 132, 139, 149, 156,
7653			166, 173, 180, 187, 194, 195, 201, 202,
7654			203, 205
7655			};
7656
7657			static const char _JJS_RBS_indicies[] = {
7658			0, 1, 2, 1, 3, 1, 5, 6,
7659			7, 8, 9, 10, 11, 12, 13, 14,
7660			15, 16, 17, 18, 19, 20, 21, 22,
7661			23, 24, 25, 26, 4, 27, 28, 29,
7662			30, 29, 29, 29, 29, 27, 31, 32,
7663			31, 29, 29, 29, 31, 29, 31, 31,
7664			27, 29, 30, 29, 29, 29, 29, 27,
7665			29, 29, 30, 29, 29, 29, 27, 29,
7666			29, 32, 29, 29, 29, 31, 31, 31,
7667			27, 29, 29, 30, 29, 29, 29, 27,
7668			29, 29, 29, 30, 29, 29, 27, 29,
7669			29, 29, 30, 29, 29, 27, 29, 29,
7670			29, 33, 29, 29, 31, 31, 31, 27,
7671			29, 29, 29, 30, 29, 29, 27, 34,
7672			34, 34, 30, 34, 34, 27, 34, 34,
7673			34, 34, 30, 34, 27, 29, 29, 29,
7674			29, 30, 29, 27, 29, 29, 29, 29,
7675			30, 29, 27, 29, 29, 29, 29, 32,
7676			29, 31, 31, 31, 27, 29, 29, 29,
7677			29, 30, 29, 27, 29, 29, 29, 29,
7678			29, 32, 31, 31, 31, 27, 34, 34,
7679			34, 34, 34, 30, 27, 34, 34, 34,
7680			34, 34, 30, 27, 29, 29, 29, 29,
7681			29, 30, 27, 29, 29, 29, 29, 29,
7682			30, 27, 1, 35, 35, 35, 35, 35,
7683			28, 28, 27, 28, 36, 35, 28, 0
7684			};
7685
7686			static const char _JJS_RBS_trans_targs[] = {
7687			2, 0, 3, 4, 5, 7, 8, 9,
7688			10, 11, 12, 31, 13, 14, 15, 16,
7689			17, 18, 19, 20, 21, 22, 23, 24,
7690			25, 26, 27, 6, 28, 29, 30, 30,
7691			30, 32, 33, 28, 28
7692			};
7693
7694			static const char _JJS_RBS_trans_actions[] = {
7695			0, 0, 0, 0, 0, 0, 0, 0,
7696			0, 0, 0, 3, 0, 0, 0, 0,
7697			0, 0, 0, 0, 0, 0, 0, 0,
7698			0, 0, 0, 0, 7, 5, 1, 5,
7699			12, 12, 5, 15, 9
7700			};
7701
7702			static const int JJS_RBS_start = 1;
7703
7704	0		void english_morpho_guesser::add_JJS_RBS(const string& form, unsigned negation_len, vector& lemmas) const {
7705	0		const char* p = form.c_str() + negation_len; int cs;
7706			char best = 'z'; unsigned remove = 0; const char* append = nullptr;
7707
7708			{
7709			cs = JJS_RBS_start;
7710			}
7711
7712			{
7713			int _klen;
7714			unsigned int _trans;
7715			const char *_acts;
7716			unsigned int _nacts;
7717			const char *_keys;
7718
7719	0	0	if ( p == ( (form.c_str() + form.size())) )
7720			goto _test_eof;
7721			if ( cs == 0 )
7722			goto _out;
7723			_resume:
7724	0		_keys = _JJS_RBS_trans_keys + _JJS_RBS_key_offsets[cs];
7725	0		_trans = _JJS_RBS_index_offsets[cs];
7726
7727	0		_klen = _JJS_RBS_single_lengths[cs];
7728	0	0	if ( _klen > 0 ) {
7729			const char *_lower = _keys;
7730			const char *_mid;
7731	0		const char *_upper = _keys + _klen - 1;
7732			while (1) {
7733	0	0	if ( _upper < _lower )
7734			break;
7735
7736	0		_mid = _lower + ((_upper-_lower) >> 1);
7737	0	0	if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < *_mid )
7738	0		_upper = _mid - 1;
7739	0	0	else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > *_mid )
7740	0		_lower = _mid + 1;
7741			else {
7742	0		_trans += (unsigned int)(_mid - _keys);
7743	0		goto _match;
7744			}
7745			}
7746	0		_keys += _klen;
7747	0		_trans += _klen;
7748			}
7749
7750	0		_klen = _JJS_RBS_range_lengths[cs];
7751	0	0	if ( _klen > 0 ) {
7752			const char *_lower = _keys;
7753			const char *_mid;
7754	0		const char *_upper = _keys + (_klen<<1) - 2;
7755			while (1) {
7756	0	0	if ( _upper < _lower )
7757			break;
7758
7759	0		_mid = _lower + (((_upper-_lower) >> 1) & ~1);
7760	0	0	if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < _mid[0] )
7761	0		_upper = _mid - 2;
7762	0	0	else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > _mid[1] )
7763	0		_lower = _mid + 2;
7764			else {
7765	0		_trans += (unsigned int)((_mid - _keys)>>1);
7766	0		goto _match;
7767			}
7768			}
7769	0		_trans += _klen;
7770			}
7771
7772			_match:
7773	0		_trans = _JJS_RBS_indicies[_trans];
7774	0		cs = _JJS_RBS_trans_targs[_trans];
7775
7776	0	0	if ( _JJS_RBS_trans_actions[_trans] == 0 )
7777			goto _again;
7778
7779	0		_acts = _JJS_RBS_actions + _JJS_RBS_trans_actions[_trans];
7780	0		_nacts = (unsigned int) *_acts++;
7781	0	0	while ( _nacts-- > 0 )
7782			{
7783	0		switch ( *_acts++ )
7784			{
7785			case 0:
7786	0	0	{ if (best > 'a') best = 'a', remove = 3, append = nullptr; }
7787			break;
7788			case 1:
7789	0	0	{ if (best > 'b') best = 'b', remove = 4, append = nullptr; }
7790			break;
7791			case 2:
7792	0	0	{ if (best > 'c') best = 'c', remove = 4, append = "y"; }
7793			break;
7794			case 3:
7795	0	0	{ if (best > 'd') best = 'd', remove = 3, append = nullptr; }
7796			break;
7797			case 4:
7798	0	0	{ if (best > 'e') best = 'e', remove = 2, append = nullptr; }
7799			break;
7800			case 5:
7801	0	0	{ if (best > 'f') best = 'f', remove = 3, append = nullptr; }
7802			break;
7803			}
7804			}
7805
7806			_again:
7807	0	0	if ( cs == 0 )
7808			goto _out;
7809	0	0	if ( ++p != ( (form.c_str() + form.size())) )
7810			goto _resume;
7811			_test_eof: {}
7812			_out: {}
7813			}
7814
7815	0	0	add(JJS, RBS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas);
		0
		0
7816	0		}
7817
7818			} // namespace morphodita
7819
7820			/////////
7821			// File: morphodita/morpho/external_morpho.h
7822			/////////
7823
7824			// This file is part of MorphoDiTa .
7825			//
7826			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
7827			// Mathematics and Physics, Charles University in Prague, Czech Republic.
7828			//
7829			// This Source Code Form is subject to the terms of the Mozilla Public
7830			// License, v. 2.0. If a copy of the MPL was not distributed with this
7831			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
7832
7833			namespace morphodita {
7834
7835	0		class external_morpho : public morpho {
7836			public:
7837	0		external_morpho(unsigned version) : version(version) {}
7838
7839			virtual int analyze(string_piece form, morpho::guesser_mode guesser, vector& lemmas) const override;
7840			virtual int generate(string_piece lemma, const char* tag_wildcard, guesser_mode guesser, vector& forms) const override;
7841			virtual int raw_lemma_len(string_piece lemma) const override;
7842			virtual int lemma_id_len(string_piece lemma) const override;
7843			virtual int raw_form_len(string_piece form) const override;
7844			virtual tokenizer* new_tokenizer() const override;
7845
7846			bool load(istream& is);
7847
7848			private:
7849			unsigned version;
7850
7851			string unknown_tag;
7852			};
7853
7854			} // namespace morphodita
7855
7856			/////////
7857			// File: morphodita/tokenizer/generic_tokenizer.h
7858			/////////
7859
7860			// This file is part of MorphoDiTa .
7861			//
7862			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
7863			// Mathematics and Physics, Charles University in Prague, Czech Republic.
7864			//
7865			// This Source Code Form is subject to the terms of the Mozilla Public
7866			// License, v. 2.0. If a copy of the MPL was not distributed with this
7867			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
7868
7869			namespace morphodita {
7870
7871	0		class generic_tokenizer : public ragel_tokenizer {
7872			public:
7873			enum { LATEST = 2 };
7874			generic_tokenizer(unsigned version);
7875
7876			virtual bool next_sentence(vector& tokens) override;
7877			};
7878
7879			} // namespace morphodita
7880
7881			/////////
7882			// File: morphodita/morpho/external_morpho.cpp
7883			/////////
7884
7885			// This file is part of MorphoDiTa .
7886			//
7887			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
7888			// Mathematics and Physics, Charles University in Prague, Czech Republic.
7889			//
7890			// This Source Code Form is subject to the terms of the Mozilla Public
7891			// License, v. 2.0. If a copy of the MPL was not distributed with this
7892			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
7893
7894			namespace morphodita {
7895
7896	0		bool external_morpho::load(istream& is) {
7897			binary_decoder data;
7898	0	0	if (!compressor::load(is, data)) return false;
		0
7899
7900			try {
7901			// Load unknown_tag
7902	0	0	unsigned length = data.next_1B();
7903	0	0	unknown_tag.assign(data.next(length), length);
		0
7904			} catch (binary_decoder_error&) {
7905			return false;
7906			}
7907
7908	0		return data.is_end();
7909			}
7910
7911	0		int external_morpho::analyze(string_piece form, guesser_mode /guesser/, vector& lemmas) const {
7912			lemmas.clear();
7913
7914	0	0	if (form.len) {
7915			// Start by skipping the first form
7916			string_piece lemmatags = form;
7917	0	0	while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++;
		0
7918	0	0	if (lemmatags.len) lemmatags.len--, lemmatags.str++;
7919
7920			// Split lemmatags using ' ' into lemma-tag pairs.
7921	0	0	while (lemmatags.len) {
7922			auto lemma_start = lemmatags.str;
7923	0	0	while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++;
		0
7924	0	0	if (!lemmatags.len) break;
7925			auto lemma_len = lemmatags.str - lemma_start;
7926	0		lemmatags.len--, lemmatags.str++;
7927
7928			auto tag_start = lemmatags.str;
7929	0	0	while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++;
		0
7930			auto tag_len = lemmatags.str - tag_start;
7931	0	0	if (lemmatags.len) lemmatags.len--, lemmatags.str++;
7932
7933	0	0	lemmas.emplace_back(string(lemma_start, lemma_len), string(tag_start, tag_len));
7934			}
7935
7936	0	0	if (!lemmas.empty()) return NO_GUESSER;
7937			}
7938
7939	0	0	lemmas.emplace_back(string(form.str, form.len), unknown_tag);
7940	0		return -1;
7941			}
7942
7943	0		int external_morpho::generate(string_piece lemma, const char* tag_wildcard, morpho::guesser_mode /guesser/, vector& forms) const {
7944			forms.clear();
7945
7946	0		tag_filter filter(tag_wildcard);
7947
7948	0	0	if (lemma.len) {
7949			// Start by locating the lemma
7950			string_piece formtags = lemma;
7951	0	0	while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++;
		0
7952	0		string_piece real_lemma(lemma.str, lemma.len - formtags.len);
7953	0	0	if (formtags.len) formtags.len--, formtags.str++;
7954
7955			// Split formtags using ' ' into form-tag pairs.
7956			bool any_result = false;
7957	0	0	while (formtags.len) {
7958			auto form_start = formtags.str;
7959	0	0	while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++;
		0
7960	0	0	if (!formtags.len) break;
7961			auto form_len = formtags.str - form_start;
7962	0		formtags.len--, formtags.str++;
7963
7964			auto tag_start = formtags.str;
7965	0	0	while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++;
		0
7966			auto tag_len = formtags.str - tag_start;
7967	0	0	if (formtags.len) formtags.len--, formtags.str++;
7968
7969			any_result = true;
7970			string tag(tag_start, tag_len);
7971	0	0	if (filter.matches(tag.c_str())) {
7972	0	0	if (forms.empty()) forms.emplace_back(string(real_lemma.str, real_lemma.len));
		0
7973	0	0	forms.back().forms.emplace_back(string(form_start, form_len), tag);
7974			}
7975			}
7976
7977	0	0	if (any_result) return NO_GUESSER;
7978			}
7979
7980			return -1;
7981			}
7982
7983	0		int external_morpho::raw_lemma_len(string_piece lemma) const {
7984			unsigned lemma_len = 0;
7985	0	0	while (lemma_len < lemma.len && lemma.str[lemma_len] != ' ') lemma_len++;
		0
7986	0		return lemma_len;
7987			}
7988
7989	0		int external_morpho::lemma_id_len(string_piece lemma) const {
7990			unsigned lemma_len = 0;
7991	0	0	while (lemma_len < lemma.len && lemma.str[lemma_len] != ' ') lemma_len++;
		0
7992	0		return lemma_len;
7993			}
7994
7995	0		int external_morpho::raw_form_len(string_piece form) const {
7996			unsigned form_len = 0;
7997	0	0	while (form_len < form.len && form.str[form_len] != ' ') form_len++;
		0
7998	0		return form_len;
7999			}
8000
8001	0		tokenizer* external_morpho::new_tokenizer() const {
8002	0		return new generic_tokenizer(version);
8003			}
8004
8005			} // namespace morphodita
8006
8007			/////////
8008			// File: morphodita/morpho/generic_lemma_addinfo.h
8009			/////////
8010
8011			// This file is part of MorphoDiTa .
8012			//
8013			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
8014			// Mathematics and Physics, Charles University in Prague, Czech Republic.
8015			//
8016			// This Source Code Form is subject to the terms of the Mozilla Public
8017			// License, v. 2.0. If a copy of the MPL was not distributed with this
8018			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
8019
8020			namespace morphodita {
8021
8022			// Declarations
8023	0		struct generic_lemma_addinfo {
8024			inline static int raw_lemma_len(string_piece lemma);
8025			inline static int lemma_id_len(string_piece lemma);
8026			inline static string format(const unsigned char* addinfo, int addinfo_len);
8027			inline static bool generatable(const unsigned char* addinfo, int addinfo_len);
8028
8029			inline int parse(string_piece lemma, bool die_on_failure = false);
8030			inline bool match_lemma_id(const unsigned char* other_addinfo, int other_addinfo_len);
8031
8032			vector data;
8033			};
8034
8035			// Definitions
8036			int generic_lemma_addinfo::raw_lemma_len(string_piece lemma) {
8037	0		return lemma.len;
8038			}
8039
8040			int generic_lemma_addinfo::lemma_id_len(string_piece lemma) {
8041	0		return lemma.len;
8042			}
8043
8044			string generic_lemma_addinfo::format(const unsigned char* /addinfo/, int /addinfo_len/) {
8045			return string();
8046			}
8047
8048			bool generic_lemma_addinfo::generatable(const unsigned char* /addinfo/, int /addinfo_len/) {
8049			return true;
8050			}
8051
8052			int generic_lemma_addinfo::parse(string_piece lemma, bool /die_on_failure/) {
8053	0		return lemma.len;
8054			}
8055
8056			bool generic_lemma_addinfo::match_lemma_id(const unsigned char* /other_addinfo/, int /other_addinfo_len/) {
8057			return true;
8058			}
8059
8060			} // namespace morphodita
8061
8062			/////////
8063			// File: morphodita/morpho/generic_morpho.h
8064			/////////
8065
8066			// This file is part of MorphoDiTa .
8067			//
8068			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
8069			// Mathematics and Physics, Charles University in Prague, Czech Republic.
8070			//
8071			// This Source Code Form is subject to the terms of the Mozilla Public
8072			// License, v. 2.0. If a copy of the MPL was not distributed with this
8073			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
8074
8075			namespace morphodita {
8076
8077	4		class generic_morpho : public morpho {
8078			public:
8079	1		generic_morpho(unsigned version) : version(version) {}
8080
8081			virtual int analyze(string_piece form, morpho::guesser_mode guesser, vector& lemmas) const override;
8082			virtual int generate(string_piece lemma, const char* tag_wildcard, guesser_mode guesser, vector& forms) const override;
8083			virtual int raw_lemma_len(string_piece lemma) const override;
8084			virtual int lemma_id_len(string_piece lemma) const override;
8085			virtual int raw_form_len(string_piece form) const override;
8086			virtual tokenizer* new_tokenizer() const override;
8087
8088			bool load(istream& is);
8089			private:
8090			inline void analyze_special(string_piece form, vector& lemmas) const;
8091
8092			unsigned version;
8093			morpho_dictionary dictionary;
8094			unique_ptr statistical_guesser;
8095
8096			string unknown_tag, number_tag, punctuation_tag, symbol_tag;
8097			};
8098
8099			} // namespace morphodita
8100
8101			/////////
8102			// File: morphodita/morpho/generic_morpho.cpp
8103			/////////
8104
8105			// This file is part of MorphoDiTa .
8106			//
8107			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
8108			// Mathematics and Physics, Charles University in Prague, Czech Republic.
8109			//
8110			// This Source Code Form is subject to the terms of the Mozilla Public
8111			// License, v. 2.0. If a copy of the MPL was not distributed with this
8112			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
8113
8114			namespace morphodita {
8115
8116	1		bool generic_morpho::load(istream& is) {
8117			binary_decoder data;
8118	1	50	if (!compressor::load(is, data)) return false;
		50
8119
8120			try {
8121			// Load tags
8122	1	50	unsigned length = data.next_1B();
8123	1	50	unknown_tag.assign(data.next(length), length);
8124	1	50	length = data.next_1B();
8125	1	50	number_tag.assign(data.next(length), length);
8126	1	50	length = data.next_1B();
8127	1	50	punctuation_tag.assign(data.next(length), length);
8128	1	50	length = data.next_1B();
8129	1	50	symbol_tag.assign(data.next(length), length);
8130
8131			// Load dictionary
8132	1	50	dictionary.load(data);
8133
8134			// Optionally statistical guesser if present
8135			statistical_guesser.reset();
8136	1	50	if (data.next_1B()) {
		50
8137	1	50	statistical_guesser.reset(new morpho_statistical_guesser());
8138	1	50	statistical_guesser->load(data);
8139		0	}
8140			} catch (binary_decoder_error&) {
8141			return false;
8142			}
8143
8144	1		return data.is_end();
8145			}
8146
8147	7		int generic_morpho::analyze(string_piece form, guesser_mode guesser, vector& lemmas) const {
8148			lemmas.clear();
8149
8150	7	50	if (form.len) {
8151			// Generate all casing variants if needed (they are different than given form).
8152			string form_uclc; // first uppercase, rest lowercase
8153			string form_lc; // all lowercase
8154	7	50	generate_casing_variants(form, form_uclc, form_lc);
8155
8156			// Start by analysing using the dictionary and all casing variants.
8157	7	50	dictionary.analyze(form, lemmas);
8158	7	50	if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas);
		0
8159	7	100	if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas);
		50
8160	7	50	if (!lemmas.empty()) return NO_GUESSER;
8161
8162			// Then call analyze_special to handle numbers, punctuation and symbols.
8163	0	0	analyze_special(form, lemmas);
8164	0	0	if (!lemmas.empty()) return NO_GUESSER;
8165
8166			// For the statistical guesser, use all casing variants.
8167	0	0	if (guesser == GUESSER && statistical_guesser) {
		0
		0
8168	0	0	if (form_uclc.empty() && form_lc.empty())
		0
		0
8169	0	0	statistical_guesser->analyze(form, lemmas, nullptr);
8170			else {
8171	0	0	morpho_statistical_guesser::used_rules used_rules; used_rules.reserve(3);
8172	0	0	statistical_guesser->analyze(form, lemmas, &used_rules);
8173	0	0	if (!form_uclc.empty()) statistical_guesser->analyze(form_uclc, lemmas, &used_rules);
		0
8174	0	0	if (!form_lc.empty()) statistical_guesser->analyze(form_lc, lemmas, &used_rules);
		0
8175			}
8176			}
8177	0	0	if (!lemmas.empty()) return GUESSER;
8178			}
8179
8180	0	0	lemmas.emplace_back(string(form.str, form.len), unknown_tag);
8181	7		return -1;
8182			}
8183
8184	0		int generic_morpho::generate(string_piece lemma, const char* tag_wildcard, morpho::guesser_mode /guesser/, vector& forms) const {
8185			forms.clear();
8186
8187	0		tag_filter filter(tag_wildcard);
8188
8189	0	0	if (lemma.len) {
8190	0	0	if (dictionary.generate(lemma, filter, forms))
		0
8191			return NO_GUESSER;
8192			}
8193
8194			return -1;
8195			}
8196
8197	0		int generic_morpho::raw_lemma_len(string_piece lemma) const {
8198	0		return generic_lemma_addinfo::raw_lemma_len(lemma);
8199			}
8200
8201	0		int generic_morpho::lemma_id_len(string_piece lemma) const {
8202	0		return generic_lemma_addinfo::lemma_id_len(lemma);
8203			}
8204
8205	7		int generic_morpho::raw_form_len(string_piece form) const {
8206	7		return form.len;
8207			}
8208
8209	0		tokenizer* generic_morpho::new_tokenizer() const {
8210	0		return new generic_tokenizer(version);
8211			}
8212
8213	0		void generic_morpho::analyze_special(string_piece form, vector& lemmas) const {
8214			using namespace unilib;
8215
8216			// Analyzer for numbers, punctuation and symbols.
8217			// Number is anything matching [+-]? is_Pn* ([.,] is_Pn)? ([Ee] [+-]? is_Pn+)? for at least one is_Pn nonempty.
8218			// Punctuation is any form beginning with either unicode punctuation or punctuation_exceptions character.
8219			// Beware that numbers takes precedence, so - is punctuation, -3 is number, -. is punctuation, -.3 is number.
8220	0	0	if (!form.len) return;
8221
8222	0		string_piece number = form;
8223	0		char32_t first = utf8::decode(number.str, number.len);
8224
8225			// Try matching a number.
8226			char32_t codepoint = first;
8227			bool any_digit = false;
8228	0	0	if (codepoint == '+' \|\| codepoint == '-') codepoint = utf8::decode(number.str, number.len);
8229	0	0	while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len);
8230	0	0	if ((codepoint == '.' && number.len) \|\| codepoint == ',') codepoint = utf8::decode(number.str, number.len);
		0
		0
8231	0	0	while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len);
8232	0	0	if (any_digit && (codepoint == 'e' \|\| codepoint == 'E')) {
		0
8233	0		codepoint = utf8::decode(number.str, number.len);
8234	0	0	if (codepoint == '+' \|\| codepoint == '-') codepoint = utf8::decode(number.str, number.len);
8235			any_digit = false;
8236	0	0	while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len);
8237			}
8238
8239	0	0	if (any_digit && !number.len && (!codepoint \|\| codepoint == '.')) {
		0
		0
8240	0	0	lemmas.emplace_back(string(form.str, form.len), number_tag);
8241	0		return;
8242			}
8243
8244			// Try matching punctuation or symbol.
8245			bool punctuation = true, symbol = true;
8246	0		string_piece form_ori = form;
8247	0	0	while (form.len) {
8248	0		codepoint = utf8::decode(form.str, form.len);
8249	0	0	punctuation = punctuation && unicode::category(codepoint) & unicode::P;
		0
8250	0	0	symbol = symbol && unicode::category(codepoint) & unicode::S;
		0
8251			}
8252	0	0	if (punctuation)
8253	0	0	lemmas.emplace_back(string(form_ori.str, form_ori.len), punctuation_tag);
8254	0	0	else if (symbol)
8255	0	0	lemmas.emplace_back(string(form_ori.str, form_ori.len), symbol_tag);
8256			}
8257
8258			} // namespace morphodita
8259
8260			/////////
8261			// File: morphodita/morpho/generic_morpho_encoder.h
8262			/////////
8263
8264			// This file is part of MorphoDiTa .
8265			//
8266			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
8267			// Mathematics and Physics, Charles University in Prague, Czech Republic.
8268			//
8269			// This Source Code Form is subject to the terms of the Mozilla Public
8270			// License, v. 2.0. If a copy of the MPL was not distributed with this
8271			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
8272
8273			namespace morphodita {
8274
8275			class generic_morpho_encoder {
8276			public:
8277	0		struct tags {
8278			string unknown_tag, number_tag, punctuation_tag, symbol_tag;
8279			};
8280			static void encode(istream& in_dictionary, int max_suffix_len, const tags& tags, istream& in_statistical_guesser, ostream& out_morpho);
8281			};
8282
8283			} // namespace morphodita
8284
8285			/////////
8286			// File: morphodita/morpho/persistent_unordered_map_encoder.h
8287			/////////
8288
8289			// This file is part of MorphoDiTa .
8290			//
8291			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
8292			// Mathematics and Physics, Charles University in Prague, Czech Republic.
8293			//
8294			// This Source Code Form is subject to the terms of the Mozilla Public
8295			// License, v. 2.0. If a copy of the MPL was not distributed with this
8296			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
8297
8298			namespace morphodita {
8299
8300			template
8301	0		persistent_unordered_map::persistent_unordered_map(const unordered_map& map, double load_factor, EntryEncode entry_encode) {
8302	0	0	construct(std::map(map.begin(), map.end()), load_factor, entry_encode);
		0
		0
		0
8303	0		}
8304
8305			template
8306	0		persistent_unordered_map::persistent_unordered_map(const unordered_map& map, double load_factor, bool add_prefixes, bool add_suffixes, EntryEncode entry_encode) {
8307			// Copy data, possibly including prefixes and suffixes
8308	0	0	std::map enlarged_map(map.begin(), map.end());
		0
8309
8310	0	0	for (auto&& entry : map) {
		0
8311	0		const string& key = entry.first;
8312
8313	0	0	if (!key.empty() && add_prefixes)
		0
		0
		0
		0
		0
8314	0	0	for (unsigned i = key.size() - 1; i; i--)
		0
8315	0	0	enlarged_map[key.substr(0, i)];
		0
		0
		0
8316
8317	0	0	if (!key.empty() && add_suffixes)
		0
		0
		0
		0
		0
8318	0	0	for (unsigned i = 1; i < key.size(); i++)
		0
8319	0	0	enlarged_map[key.substr(i)];
		0
		0
		0
8320			}
8321
8322	0	0	construct(enlarged_map, load_factor, entry_encode);
		0
8323	0		}
8324
8325			// We could (and used to) use unordered_map as input parameter.
8326			// Nevertheless, as order is unspecified, the resulting persistent_unordered_map
8327			// has different collision chains when generated on 32-bit and 64-bit machines.
8328			// To guarantee uniform binary representation, we use map instead.
			//
			// Builds the table from an ordered map in three passes:
			//   1) count how many keys there are of each key length and call resize()
			//      once per length with unsigned(load_factor * count);
			//   2) call add() for every key with the byte size of its encoded value,
			//      then done_adding();
			//   3) call fill() for every key and copy the encoded value into the
			//      returned location, then done_filling().
			// entry_encode is invoked twice per element (once in pass 2 only to learn
			// the encoded size, once in pass 3 to produce the bytes), so it has to
			// yield the same number of bytes both times.
			// NOTE(review): resize/add/fill/done_adding/done_filling are defined
			// outside this chunk; their exact semantics are assumed from usage here.
8329			template
8330	0		void persistent_unordered_map::construct(const map& map, double load_factor, EntryEncode entry_encode) {
8331			// 1) Count number of elements for each size
8332			vector sizes;
8333	0	0	for (auto&& elem : map) {
		0
		0
		0
8334	0		unsigned len = elem.first.size();
			// Grow the histogram so that index len is valid.
8335	0	0	if (len >= sizes.size()) sizes.resize(len + 1);
		0
		0
		0
		0
		0
		0
		0
8336	0		sizes[len]++;
8337			}
			// One resize() call per key length, in increasing length order
			// (including lengths with zero keys).
8338	0	0	for (auto&& size : sizes)
		0
		0
		0
8339	0	0	resize(unsigned(load_factor * size));
		0
		0
		0
8340
8341			// 2) Add sizes of element data
8342	0	0	for (auto&& elem : map) {
		0
		0
		0
8343	0	0	binary_encoder enc;
		0
		0
		0
8344	0	0	entry_encode(enc, elem.second);
		0
		0
		0
			// Only the encoded size is used in this pass; the bytes are discarded.
8345	0		add(elem.first.c_str(), elem.first.size(), enc.data.size());
8346			}
8347	0	0	done_adding();
		0
		0
		0
8348
8349			// 3) Fill in element data
8350	0	0	for (auto&& elem : map) {
		0
		0
		0
8351	0	0	binary_encoder enc;
		0
		0
		0
8352	0	0	entry_encode(enc, elem.second);
		0
		0
		0
			// fill() yields the destination for this key's data; the encoded bytes
			// are copied there.
8353	0		small_memcpy(fill(elem.first.c_str(), elem.first.size(), enc.data.size()), enc.data.data(), enc.data.size());
8354			}
8355	0		done_filling();
8356	0		}
8357
8358	0		void persistent_unordered_map::save(binary_encoder& enc) {
			// Serializes the whole map into enc: first the number of elements of
			// `hashes` (written with add_1B), then every element of `hashes` in order
			// via its own save().
8359	0		enc.add_1B(hashes.size());
8360
8361	0	0	for (auto&& hash : hashes)
8362	0		hash.save(enc);
8363	0		}
8364
8365	0		void persistent_unordered_map::fnv_hash::save(binary_encoder& enc) {
			// Serializes one hash table into enc as two length-prefixed sections:
			// the size of `hash` (add_4B) followed by the contents of `hash`, then the
			// size of `data` (add_4B) followed by the contents of `data`.
8366	0		enc.add_4B(hash.size());
8367			enc.add_data(hash);
8368
8369	0		enc.add_4B(data.size());
8370			enc.add_data(data);
8371	0		}
8372
8373			} // namespace morphodita
8374
8375			/////////
8376			// File: morphodita/morpho/raw_morpho_dictionary_reader.h
8377			/////////
8378
8379			// This file is part of MorphoDiTa .
8380			//
8381			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
8382			// Mathematics and Physics, Charles University in Prague, Czech Republic.
8383			//
8384			// This Source Code Form is subject to the terms of the Mozilla Public
8385			// License, v. 2.0. If a copy of the MPL was not distributed with this
8386			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
8387
8388			namespace morphodita {
8389
8390	0		class raw_morpho_dictionary_reader {
			// Reader of a raw morphological dictionary from an input stream.
			// The class keeps a reference to the stream passed to the constructor, so
			// that stream must stay alive for as long as the reader is used.
			// next_lemma() is only declared here; its definition is outside this chunk.
8391			public:
8392	0		raw_morpho_dictionary_reader(istream& in) : in(in) {}
			// Fills `lemma` and `tagged_forms`; the bool result is used by callers as a
			// loop condition (see dictionary::load).
			// NOTE(review): presumably returns false at end of input -- confirm in the
			// definition.
8393			bool next_lemma(string& lemma, vector>& tagged_forms);
8394			private:
8395			istream& in;
			// Working state kept between next_lemma() calls.
8396			string line;
8397			vector tokens;
8398			unordered_set seen_lemmas;
8399			};
8400
8401			} // namespace morphodita
8402
8403			/////////
8404			// File: utils/new_unique_ptr.h
8405			/////////
8406
8407			// This file is part of UFAL C++ Utils .
8408			//
8409			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
8410			// Mathematics and Physics, Charles University in Prague, Czech Republic.
8411			//
8412			// This Source Code Form is subject to the terms of the Mozilla Public
8413			// License, v. 2.0. If a copy of the MPL was not distributed with this
8414			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
8415
8416			namespace utils {
8417
8418			template
			// Helper that allocates a T with `new`, perfectly forwarding the given
			// arguments to T's constructor, and returns the object owned by a
			// unique_ptr.
			// NOTE(review): the template parameter list and the unique_ptr/forward
			// template arguments are not visible in this listing.
8419	3		unique_ptr new_unique_ptr(Args&&... args) {
8420	3	50	return unique_ptr(new T(std::forward(args)...));
		0
		0
8421			}
8422
8423			} // namespace utils
8424
8425			/////////
8426			// File: morphodita/morpho/morpho_dictionary_encoder.h
8427			/////////
8428
8429			// This file is part of MorphoDiTa .
8430			//
8431			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
8432			// Mathematics and Physics, Charles University in Prague, Czech Republic.
8433			//
8434			// This Source Code Form is subject to the terms of the Mozilla Public
8435			// License, v. 2.0. If a copy of the MPL was not distributed with this
8436			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
8437
8438			namespace morphodita {
8439
8440			// Declarations
8441			template
			// Stateless entry point of the dictionary encoder: a single static
			// encode() that takes the input stream `is`, the limit `max_suffix_len`
			// and the output `enc`. Its definition (further below in the file) loads
			// a `dictionary` from `is` and encodes it into `enc`.
8442			class morpho_dictionary_encoder {
8443			public:
8444			static void encode(istream& is, int max_suffix_len, binary_encoder& enc);
8445			};
8446
8447			// Definitions
8448			template
			// In-memory representation of a morphological dictionary while it is being
			// encoded. load() reads lemmas and their forms and groups forms into
			// classes; encode() serializes the collected data. Both are defined after
			// the class.
8449	0		class dictionary {
8450			public:
8451			void load(istream& is, int max_suffix_len);
8452			void encode(binary_encoder& enc);
8453
8454			private:
			// Character trie over a set of strings. Every node stores `depth`, the
			// length of the longest path from this node down to a leaf.
8455	0		class trie {
8456			public:
8457	0		trie() : depth(0) {}
8458
			// Inserts the NUL-terminated string `str` below this node, reusing an
			// existing child for the first character when there is one, and updates
			// `depth` on the way back up. An empty string is a no-op.
8459	0		void add(const char* str) {
8460	0	0	if (!*str) return;
8461
8462	0	0	for (auto&& child : children)
8463	0	0	if (child.first == *str) {
8464	0		child.second->add(str + 1);
8465	0		depth = max(depth, 1 + child.second->depth);
8466			return;
8467			}
			// No child for this character yet: create one and continue there.
8468	0	0	children.emplace_back(*str, new_unique_ptr());
8469	0		children.back().second->add(str + 1);
8470	0		depth = max(depth, 1 + children.back().second->depth);
8471			}
8472
			// Returns the path (as a string) to the node selected by the recursive
			// overload below; returns an empty string if no node qualifies.
8473	0		string find_candidate_prefix(int max_suffix_len) {
8474			string current, best;
8475	0		int best_length = 0;
8476	0	0	find_candidate_prefix(max_suffix_len, current, best, best_length, 0);
8477	0		return best;
8478			}
			// Depth-first search. `current` is the path from the root to this node.
			// A node becomes the new best when the subtree below it is shallower than
			// max_suffix_len and its `length` exceeds the best so far. `length` is
			// incremented when descending from a node with exactly one child and is
			// reset to 1 when descending from a node with several children.
8479	0		void find_candidate_prefix(int max_suffix_len, string& current, string& best, int& best_length, int length) {
8480	0	0	if (depth < max_suffix_len && length > best_length) {
		0
8481			best = current;
8482	0		best_length = length;
8483			}
8484	0	0	for (auto&& child : children) {
8485	0		current.push_back(child.first);
8486	0	0	child.second->find_candidate_prefix(max_suffix_len, current, best, best_length, children.size() == 1 ? length + 1 : 1);
			// Undo the push_back above before visiting the next child.
8487	0		current.resize(current.size() - 1);
8488			}
8489	0		}
8490
			// (character, subtree) pairs in insertion order.
8491			vector>> children;
8492			int depth;
8493			};
8494
			// Collects strings bucketed by their length; lengths[n] holds the strings
			// of size n that were added.
8495	0		class histogram {
8496			public:
8497	0		void add(const string& str) {
8498	0	0	if (str.size() >= lengths.size()) lengths.resize(str.size() + 1);
8499			lengths[str.size()].insert(str);
8500	0		}
8501
			// Writes the number of buckets (add_1B) followed by the number of
			// elements in each bucket (add_4B each). The strings themselves are not
			// written.
8502	0		void encode(binary_encoder& enc) {
8503	0		enc.add_1B(lengths.size());
8504	0	0	for (auto&& set : lengths)
8505	0		enc.add_4B(set.size());
8506	0		}
8507
8508			vector> lengths;
8509			};
8510
			// One lemma together with its additional info and the list of its forms.
8511	0		struct lemma_info {
			// Splits the given text: addinfo.parse() consumes the text and returns a
			// length; the first that many characters are stored as `lemma`.
8512	0		lemma_info(string lemma) {
8513	0	0	this->lemma = lemma.substr(0, addinfo.parse(lemma, true));
8514	0		}
8515
8516			string lemma;
8517			LemmaAddinfo addinfo;
			// A form string paired with the id of its class (`clas`).
8518	0		struct lemma_form_info {
8519	0		lemma_form_info(string form, int clas) : form(form), clas(clas) {}
8520
8521			string form;
8522			int clas;
8523
			// Orders by form first, then by class id.
8524	0	0	bool operator<(const lemma_form_info& other) const { return form < other.form \|\| (form == other.form && clas < other.clas); }
		0
		0
8525			};
8526			vector forms;
8527
			// Orders by lemma text first, then by addinfo.data.
8528	0	0	bool operator<(const lemma_info& other) const { return lemma < other.lemma \|\| (lemma == other.lemma && addinfo.data < other.addinfo.data); }
8529			};
8530
			// Class text -> class id (ids are assigned in order of first appearance).
8531			unordered_map classes;
			// Suffix -> (class id -> tag ids); filled only when a class is first seen.
8532			unordered_map>> suffixes;
8533
			// Tag strings indexed by tag id, and the reverse lookup.
8534			vector tags;
8535			unordered_map tags_map;
8536
8537			histogram lemmas_hist, forms_hist;
8538
8539			vector lemmas;
8540			};
8541
8542			template
			// Encodes a dictionary read from `is` into `enc`: a local `dictionary` is
			// loaded with the given max_suffix_len and then serialized.
8543	0		void morpho_dictionary_encoder::encode(istream& is, int max_suffix_len, binary_encoder& enc) {
8544	0		dictionary dict;
8545
8546			// Load the dictionary and create classes
8547	0	0	dict.load(is, max_suffix_len);
8548
8549			// Encode the dictionary
8550	0	0	dict.encode(enc);
8551	0		}
8552
8553			template
			// Reads all lemmas from `is` and fills the dictionary members.
			// For every lemma returned by the raw reader:
			//   - duplicate (form, tag) pairs are removed;
			//   - a lemma_info is appended and the lemma is added to lemmas_hist;
			//   - the forms are repeatedly partitioned: a trie of the remaining forms
			//     yields a candidate prefix, the contiguous run of (sorted) forms
			//     starting with it becomes one class, and the run is removed, until
			//     no forms remain.
			// A class is identified by the tab-joined text of its (suffix, tag)
			// pairs, so identical paradigms of different lemmas share one class id.
			// Finally the forms of each lemma and the lemmas themselves are sorted.
8554	0		void dictionary::load(istream& is, int max_suffix_len) {
8555			// Load lemmas and create classes
8556	0		raw_morpho_dictionary_reader raw(is);
8557			string lemma;
8558	0		vector> forms;
8559	0	0	while(raw.next_lemma(lemma, forms)) {
		0
8560			// Make sure forms are unique
8561			sort(forms.begin(), forms.end());
8562			auto forms_end = unique(forms.begin(), forms.end());
8563	0	0	if (forms_end != forms.end()) {
8564			// cerr << "Warning: repeated form-tag in lemma " << lemma << '.' << endl;
8565			forms.erase(forms_end, forms.end());
8566			}
8567
8568			// Create lemma_info
8569	0	0	lemmas.emplace_back(lemma);
8570			auto& lemma_info = lemmas.back();
8571	0	0	lemmas_hist.add(lemma_info.lemma);
8572
8573			// Create classes
8574	0	0	while (!forms.empty()) {
			// The trie is rebuilt from the forms that are still unprocessed.
8575			trie t;
8576	0	0	for (auto&& form : forms)
8577	0	0	t.add(form.first.c_str());
8578
8579			// Find prefix of forms in class being added.
8580	0	0	string prefix = t.find_candidate_prefix(max_suffix_len);
8581
8582			// Find forms of the class being added.
			// forms is sorted, so the forms sharing `prefix` are contiguous:
			// [start, end) is that run.
8583			auto start = forms.begin();
8584	0	0	while (start != forms.end() && start->first.compare(0, prefix.size(), prefix) != 0) start++;
		0
		0
		0
8585	0	0	if (start == forms.end()) training_failure("Internal error when generating classes, cannot find prefix '" << prefix << "'!");
		0
		0
8586			auto end = start;
8587	0	0	while (end != forms.end() && end->first.compare(0, prefix.size(), prefix) == 0) end++;
		0
		0
		0
8588
8589			// Find common prefix of class forms -- may be larger than prefix.
			// Comparing only the first and the last form of the sorted run is
			// enough to get the prefix common to the whole run.
8590	0		int common_prefix = prefix.size();
8591	0	0	while (common_prefix < int(start->first.size()) && start->first[common_prefix] == (end-1)->first[common_prefix]) common_prefix++;
		0
		0
8592
			// Build the class key: "suffix\ttag" for each form, joined by tabs.
8593			string clas;
8594	0	0	for (auto form = start; form != end; form++) {
8595	0	0	if (!clas.empty()) clas.push_back('\t');
		0
8596	0	0	clas.append(form->first, common_prefix, string::npos);
8597	0	0	clas.push_back('\t');
8598			clas.append(form->second);
8599			}
8600
			// emplace() returns the existing entry when the class is already known;
			// class_it.second tells whether a new class was inserted.
8601	0		auto class_it = classes.emplace(clas, int(classes.size()));
8602	0		int class_id = class_it.first->second;
8603	0	0	if (class_it.second) {
8604			// New class, add it, together with its tags.
8605	0	0	for (auto form = start; form != end; form++) {
			// Look up the tag id, assigning the next free id to an unseen tag.
8606	0		int tag = tags_map.emplace(form->second, int(tags.size())).first->second;
8607	0	0	if (tag >= int(tags.size())) tags.emplace_back(form->second);
		0
8608	0	0	suffixes[form->first.substr(common_prefix)][class_id].emplace_back(tag);
		0
		0
8609			}
8610			}
8611
8612			// Move forms in the class being added to lemma and remove them from unprocessed forms.
8613	0	0	lemma_info.forms.emplace_back(start->first.substr(0, common_prefix), class_id);
		0
8614	0	0	forms_hist.add(lemma_info.forms.back().form);
8615			forms.erase(start, end);
8616			}
8617			stable_sort(lemma_info.forms.begin(), lemma_info.forms.end());
8618			}
8619			stable_sort(lemmas.begin(), lemmas.end());
8620	0		}
8621
8622			template
			// Serializes the loaded dictionary into `enc` in this order:
			//   1) the lemma and form length histograms;
			//   2) the number of lemmas and, per lemma, a delta against the previous
			//      lemma (bytes to drop from its end, bytes to append), the addinfo
			//      data and the list of forms;
			//   3) the tag table;
			//   4) the suffix -> classes table as a persistent_unordered_map.
			// Each form is stored as an edit of the previous form (the lemma itself
			// for the first form), based on their longest common substring.
8623	0		void dictionary::encode(binary_encoder& enc) {
8624			// Encode lemmas and forms
8625	0		lemmas_hist.encode(enc);
8626	0		forms_hist.encode(enc);
8627
8628	0		string prev = "";
8629	0		enc.add_4B(lemmas.size());
8630	0	0	for (auto&& lemma : lemmas) {
			// cpl = length of the common prefix of the previous and current lemma.
8631			int cpl = 0;
8632	0	0	while (prev[cpl] && prev[cpl] == lemma.lemma[cpl]) cpl++;
		0
		0
8633
			// Bytes to remove from the end of prev, then bytes to append and the
			// appended text itself.
8634	0	0	enc.add_1B(prev.length() - cpl);
8635	0	0	enc.add_1B(lemma.lemma.size() - cpl);
8636	0	0	enc.add_data(lemma.lemma.substr(cpl));
8637	0	0	enc.add_1B(lemma.addinfo.data.size());
8638			enc.add_data(lemma.addinfo.data);
8639	0	0	enc.add_1B(lemma.forms.size());
8640
8641			string prev_form = lemma.lemma;
8642	0	0	for (auto&& lemma_form : lemma.forms) {
			// Brute-force search for the longest common substring of prev_form and
			// the current form; ties keep the first match found.
8643			unsigned best_prev_from = 0, best_form_from = 0, best_len = 0;
8644	0	0	for (unsigned prev_from = 0; prev_from < prev_form.size(); prev_from++)
8645	0	0	for (unsigned form_from = 0; form_from < lemma_form.form.size(); form_from++) {
8646			unsigned len = 0;
8647	0	0	while (prev_from + len < prev_form.size() && form_from + len < lemma_form.form.size() && prev_form[prev_from+len] == lemma_form.form[form_from+len]) len++;
		0
		0
		0
8648	0	0	if (len > best_len) best_prev_from = prev_from, best_form_from = form_from, best_len = len;
8649			}
8650
			// Flag byte telling which of the four optional edit parts follow.
			// NOTE(review): the next two code rows are cut off in this listing right
			// after `best_len` (the text from the `<` onwards is missing), so the
			// REMOVE_END and ADD_END conditions cannot be read here. Presumably
			// they match the `if` conditions used just below -- confirm in the
			// original source.
8651			enum { REMOVE_START = 1, REMOVE_END = 2, ADD_START = 4, ADD_END = 8 };
8652	0	0	enc.add_1B(REMOVE_START * (best_prev_from>0) + REMOVE_END * (best_prev_from+best_len
		0
		0
		0
8653			ADD_START * (best_form_from>0) + ADD_END * (best_form_from+best_len
			// Number of bytes of prev_form before the common part.
8654	0	0	if (best_prev_from > 0) enc.add_1B(best_prev_from);
		0
			// Number of bytes of prev_form after the common part.
8655	0	0	if (best_prev_from + best_len < prev_form.size()) enc.add_1B(prev_form.size() - best_prev_from - best_len);
		0
			// Text of the new form before the common part.
8656	0	0	if (best_form_from > 0) {
8657	0	0	enc.add_1B(best_form_from);
8658	0	0	enc.add_data(lemma_form.form.substr(0, best_form_from));
8659			}
			// Text of the new form after the common part.
8660	0	0	if (best_form_from + best_len < lemma_form.form.size()) {
8661	0	0	enc.add_1B(lemma_form.form.size() - best_form_from - best_len);
8662	0	0	enc.add_data(lemma_form.form.substr(best_form_from + best_len));
8663			}
8664	0	0	enc.add_2B(lemma_form.clas);
8665
8666	0		prev_form = lemma_form.form;
8667			}
8668
8669			prev = lemma.lemma;
8670			}
8671
8672			// Encode tags
8673	0	0	enc.add_2B(tags.size());
8674	0	0	for (auto&& tag : tags) {
8675	0	0	enc.add_1B(tag.size());
8676			enc.add_data(tag);
8677			}
8678
8679			// Encode classes
			// Built with load factor 5, add_prefixes=false, add_suffixes=true. The
			// value of each suffix is encoded by the lambda as: number of classes,
			// the class ids, then one running tag count before each class plus the
			// final total, then all tag ids class by class.
8680	0	0	persistent_unordered_map(suffixes, 5, false, true, [](binary_encoder& enc, const map>& suffix) {
8681	0		enc.add_2B(suffix.size());
8682	0	0	for (auto&& clas : suffix)
8683	0		enc.add_2B(clas.first);
8684			uint32_t tags = 0, prev_tags = 0;
8685	0	0	for (auto&& clas : suffix) {
			// The value is narrowed to uint16_t only when the step since the
			// previous offset is below 1<<16; otherwise it is passed unnarrowed.
			// NOTE(review): presumably so that add_2B can detect the overflow.
8686	0	0	enc.add_2B(tags - prev_tags < (1<<16) ? uint16_t(tags) : tags);
8687			prev_tags = tags;
8688	0		tags += clas.second.size();
8689			}
8690	0	0	enc.add_2B(tags - prev_tags < (1<<16) ? uint16_t(tags) : tags);
8691	0	0	for (auto&& clas : suffix)
8692	0	0	for (auto&& tag : clas.second)
8693	0		enc.add_2B(tag);
8694	0	0	}).save(enc);
8695	0		}
8696
8697			} // namespace morphodita
8698
8699			/////////
8700			// File: morphodita/morpho/morpho_prefix_guesser_encoder.h
8701			/////////
8702
8703			// This file is part of MorphoDiTa .
8704			//
8705			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
8706			// Mathematics and Physics, Charles University in Prague, Czech Republic.
8707			//
8708			// This Source Code Form is subject to the terms of the Mozilla Public
8709			// License, v. 2.0. If a copy of the MPL was not distributed with this
8710			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
8711
8712			namespace morphodita {
8713
8714			class morpho_prefix_guesser_encoder {
			// Declares the static entry point of the prefix guesser encoder, taking
			// the input stream `is` and the output encoder `enc`. The definition is
			// not part of this chunk.
8715			public:
8716			static void encode(istream& is, binary_encoder& enc);
8717			};
8718
8719			} // namespace morphodita
8720
8721			/////////
8722			// File: morphodita/morpho/morpho_statistical_guesser_encoder.h
8723			/////////
8724
8725			// This file is part of MorphoDiTa .
8726			//
8727			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
8728			// Mathematics and Physics, Charles University in Prague, Czech Republic.
8729			//
8730			// This Source Code Form is subject to the terms of the Mozilla Public
8731			// License, v. 2.0. If a copy of the MPL was not distributed with this
8732			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
8733
8734			namespace morphodita {
8735
8736			class morpho_statistical_guesser_encoder {
			// Declares the static entry point of the statistical guesser encoder,
			// taking the input stream `is` and the output encoder `enc`. The
			// definition lives in morpho_statistical_guesser_encoder.cpp.
8737			public:
8738			static void encode(istream& is, binary_encoder& enc);
8739			};
8740
8741			} // namespace morphodita
8742
8743			/////////
8744			// File: morphodita/morpho/generic_morpho_encoder.cpp
8745			/////////
8746
8747			// This file is part of MorphoDiTa .
8748			//
8749			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
8750			// Mathematics and Physics, Charles University in Prague, Czech Republic.
8751			//
8752			// This Source Code Form is subject to the terms of the Mozilla Public
8753			// License, v. 2.0. If a copy of the MPL was not distributed with this
8754			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
8755
8756			namespace morphodita {
8757
8758	0		void generic_morpho_encoder::encode(istream& in_dictionary, int max_suffix_len, const tags& tags, istream& in_statistical_guesser, ostream& out_morpho) {
			// Encodes a generic morphology model and writes it to out_morpho.
			// Layout produced in the encoder, in order:
			//   - the four special tags (unknown, number, punctuation, symbol), each
			//     as a one-byte length followed by the tag text;
			//   - the dictionary read from in_dictionary;
			//   - a one-byte flag telling whether a statistical guesser follows, and
			//     the guesser itself when it does.
			// The encoder contents are then written through compressor::save();
			// training_failure is raised when that fails.
8759	0		binary_encoder enc;
8760
8761	0	0	enc.add_1B(tags.unknown_tag.size());
8762			enc.add_data(tags.unknown_tag);
8763	0	0	enc.add_1B(tags.number_tag.size());
8764			enc.add_data(tags.number_tag);
8765	0	0	enc.add_1B(tags.punctuation_tag.size());
8766			enc.add_data(tags.punctuation_tag);
8767	0	0	enc.add_1B(tags.symbol_tag.size());
8768			enc.add_data(tags.symbol_tag);
8769
8770			// cerr << "Encoding dictionary." << endl;
8771	0	0	morpho_dictionary_encoder::encode(in_dictionary, max_suffix_len, enc);
8772
8773			// Load and encode statistical guesser if requested
			// "Requested" is decided by the state of the stream: a stream that
			// converts to false (e.g. one that failed to open) means no guesser.
8774	0	0	enc.add_1B(bool(in_statistical_guesser));
8775	0	0	if (in_statistical_guesser) {
8776			// cerr << "Encoding statistical guesser." << endl;
8777	0	0	morpho_statistical_guesser_encoder::encode(in_statistical_guesser, enc);
8778			}
8779
8780			// done, save the dictionary
8781			// cerr << "Compressing dictionary." << endl;
8782	0	0	if (!compressor::save(out_morpho, enc)) training_failure("Cannot compress and write dictionary to file!");
		0
		0
		0
8783			// cerr << "Dictionary saved." << endl;
8784	0		}
8785
8786			} // namespace morphodita
8787
8788			/////////
8789			// File: morphodita/morpho/morpho_ids.h
8790			/////////
8791
8792			// This file is part of MorphoDiTa .
8793			//
8794			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
8795			// Mathematics and Physics, Charles University in Prague, Czech Republic.
8796			//
8797			// This Source Code Form is subject to the terms of the Mozilla Public
8798			// License, v. 2.0. If a copy of the MPL was not distributed with this
8799			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
8800
8801			namespace morphodita {
8802
8803			class morpho_ids {
			// Numeric identifiers of the morphology model kinds and a parser from
			// their textual names. morpho::load() reads one of these values as the
			// first byte of a model.
8804			public:
8805			enum morpho_id {
8806			CZECH = 0,
8807			ENGLISH_V1 = 1,
8808			GENERIC = 2,
8809			EXTERNAL = 3,
8810			ENGLISH_V2 = 4,
			// ENGLISH is an alias of the newest English version.
8811			ENGLISH_V3 = 5, ENGLISH = ENGLISH_V3,
8812			SLOVAK_PDT = 6,
8813			DERIVATOR_DICTIONARY = 7,
8814			};
8815
			// Maps a textual name to its id. Recognized names are "czech",
			// "english", "external", "generic" and "slovak_pdt" (exact match).
			// Returns true and sets `id` on success; returns false and leaves `id`
			// untouched for any other string. The versioned English ids and
			// DERIVATOR_DICTIONARY have no textual name here.
8816			static bool parse(const string& str, morpho_id& id) {
8817			if (str == "czech") return id = CZECH, true;
8818			if (str == "english") return id = ENGLISH, true;
8819			if (str == "external") return id = EXTERNAL, true;
8820			if (str == "generic") return id = GENERIC, true;
8821			if (str == "slovak_pdt") return id = SLOVAK_PDT, true;
8822			return false;
8823			}
8824			};
8825
			// Shorthand so the enum can be named without the class qualifier.
8826			typedef morpho_ids::morpho_id morpho_id;
8827
8828			} // namespace morphodita
8829
8830			/////////
8831			// File: morphodita/morpho/morpho.cpp
8832			/////////
8833
8834			// This file is part of MorphoDiTa .
8835			//
8836			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
8837			// Mathematics and Physics, Charles University in Prague, Czech Republic.
8838			//
8839			// This Source Code Form is subject to the terms of the Mozilla Public
8840			// License, v. 2.0. If a copy of the MPL was not distributed with this
8841			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
8842
8843			namespace morphodita {
8844
8845	1		morpho* morpho::load(istream& is) {
			// Factory: reads one byte from `is` as a morpho_id, creates the matching
			// implementation, lets it load itself from the rest of the stream and
			// returns it. Returns nullptr when the id is not handled by the switch
			// or when loading fails. On success the returned raw pointer has been
			// released from its unique_ptr, so the caller takes ownership.
			// NOTE(review): the concrete class names passed to new_unique_ptr are not
			// visible in this listing (template arguments were stripped).
8846	1		morpho_id id = morpho_id(is.get());
8847	1		switch (id) {
8848			case morpho_ids::CZECH:
8849			{
8850	0		auto res = new_unique_ptr(czech_morpho::morpho_language::CZECH, 1);
8851	0	0	if (res->load(is)) return res.release();
		0
8852			break;
8853			}
8854			case morpho_ids::ENGLISH_V1:
8855			case morpho_ids::ENGLISH_V2:
8856			case morpho_ids::ENGLISH_V3:
8857			{
			// The three English ids share one implementation; the constructor
			// argument 1/2/3 selects the version.
8858			auto res = new_unique_ptr(id == morpho_ids::ENGLISH_V1 ? 1 :
8859			id == morpho_ids::ENGLISH_V2 ? 2 :
8860	0	0	3);
		0
8861	0	0	if (res->load(is)) return res.release();
		0
8862			break;
8863			}
8864			case morpho_ids::EXTERNAL:
8865			{
8866	0		auto res = new_unique_ptr(1);
8867	0	0	if (res->load(is)) return res.release();
		0
8868			break;
8869			}
8870			case morpho_ids::GENERIC:
8871			{
8872	1		auto res = new_unique_ptr(1);
8873	1	50	if (res->load(is)) return res.release();
		50
8874			break;
8875			}
8876			case morpho_ids::SLOVAK_PDT:
8877			{
8878	0		auto res = new_unique_ptr(czech_morpho::morpho_language::SLOVAK, 3);
8879	0	0	if (res->load(is)) return res.release();
		0
8880			break;
8881			}
8882			case morpho_ids::DERIVATOR_DICTIONARY:
8883			{
			// A derivator is stored in front of a regular model: load the
			// derivator first, then load the following model recursively.
8884	0		auto derinet = new_unique_ptr();
8885	0	0	if (!derinet->load(is)) return nullptr;
		0
8886
8887	0	0	unique_ptr dictionary(load(is));
8888	0	0	if (!dictionary) return nullptr;
			// Link both ways: the derivator gets a non-owning pointer to the
			// dictionary, and the dictionary takes ownership of the derivator.
8889	0		derinet->dictionary = dictionary.get();
8890			dictionary->derinet.reset(derinet.release());
8891	0		return dictionary.release();
8892			}
8893			}
8894
			// Reached for an unhandled id or after a failed load (the `break`s above).
8895			return nullptr;
8896			}
8897
8898	0		morpho* morpho::load(const char* fname) {
			// Convenience overload: opens the file named `fname` (converted with
			// path_from_utf8) in binary mode and delegates to load(istream&).
			// Returns nullptr when the file cannot be opened.
8899	0	0	ifstream f(path_from_utf8(fname).c_str(), ifstream::binary);
8900	0	0	if (!f) return nullptr;
8901
8902	0	0	return load(f);
8903			}
8904
8905	0		const derivator* morpho::get_derivator() const {
			// Returns the pointer held by the `derinet` member without transferring
			// ownership; it is null when `derinet` holds nothing.
8906	0		return derinet.get();
8907			}
8908
8909			} // namespace morphodita
8910
8911			/////////
8912			// File: morphodita/morpho/morpho_statistical_guesser.cpp
8913			/////////
8914
8915			// This file is part of MorphoDiTa .
8916			//
8917			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
8918			// Mathematics and Physics, Charles University in Prague, Czech Republic.
8919			//
8920			// This Source Code Form is subject to the terms of the Mozilla Public
8921			// License, v. 2.0. If a copy of the MPL was not distributed with this
8922			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
8923
8924			namespace morphodita {
8925
8926	1		void morpho_statistical_guesser::load(binary_decoder& data) {
			// Reads the guesser from `data` in the same order the encoder writes it:
			// a 2-byte tag count, then each tag as a 1-byte length followed by that
			// many bytes, then the 2-byte index of the default tag, and finally the
			// rules table, which loads itself.
8927			// Load tags and default tag
8928	1		tags.resize(data.next_2B());
8929	7	100	for (auto&& tag : tags) {
8930	6		tag.resize(data.next_1B());
8931	403	100	for (unsigned i = 0; i < tag.size(); i++)
8932	397		tag[i] = data.next_1B();
8933			}
8934	1		default_tag = data.next_2B();
8935
8936			// Load rules
8937	1		rules.load(data);
8938	1		}
8939
8940			// Helper method for analyze.
			// Returns true when `rule` equals one of the entries of `*used` (linear
			// search). A null `used` pointer means nothing is being tracked and the
			// result is false.
8941	0		static bool contains(morpho_statistical_guesser::used_rules* used, const string& rule) {
8942	0	0	if (!used) return false;
8943
8944	0	0	for (auto&& used_rule : *used)
8945	0	0	if (used_rule == rule)
8946			return true;
8947
8948			return false;
8949			}
8950
8951			// Produces unique lemma-tag pairs.
			//
			// Guesses lemmas and tags for `form` and appends them to `lemmas`.
			//   form   - the word to analyze
			//   lemmas - output; results are appended after the existing entries
			//   used   - optional list of rule labels already applied; when non-null,
			//            a rule whose label is already listed is not applied again
			//            and every newly applied label is appended to it
			// Rule lookup: the label is the reversed form suffix, a space, and a form
			// prefix. The longest suffix that has some non-empty rule is chosen, and
			// for it the longest prefix with a non-empty rule. Only that one rule
			// set is applied. If no lemma was added at all, the form itself is
			// emitted with the default tag.
8952	0		void morpho_statistical_guesser::analyze(string_piece form, vector& lemmas, morpho_statistical_guesser::used_rules* used) {
8953			unsigned lemmas_initial_size = lemmas.size();
8954
8955			// We have rules in format "suffix prefix" in rules.
8956			// Find the matching rule with longest suffix and of those with longest prefix.
8957	0	0	string rule_label; rule_label.reserve(12);
			// Extend the suffix one character at a time (characters are appended
			// from the end of the form backwards) while the label is still a key in
			// rules. The lambda passed to rules.at skips over one entry: a 2-byte
			// length followed by that many bytes.
8958			unsigned suffix_len = 0;
8959	0	0	for (; suffix_len < form.len; suffix_len++) {
8960	0	0	rule_label.push_back(form.str[form.len - (suffix_len + 1)]);
8961	0	0	if (!rules.at(rule_label.c_str(), rule_label.size(), [](pointer_decoder& data){ data.next(data.next_2B()); }))
8962			break;
8963			}
8964
			// Try suffix lengths from the longest candidate down to 0.
8965	0	0	for (suffix_len++; suffix_len--; ) {
8966	0		rule_label.resize(suffix_len);
8967	0	0	rule_label.push_back(' ');
8968
8969			const unsigned char* rule = nullptr;
8970			unsigned rule_prefix_len = 0;
			// Grow the prefix part of the label while the label is still a key;
			// remember the longest prefix whose entry is non-empty.
8971	0	0	for (unsigned prefix_len = 0; prefix_len + suffix_len <= form.len; prefix_len++) {
8972	0	0	if (prefix_len) rule_label.push_back(form.str[prefix_len - 1]);
		0
8973	0		const unsigned char* found = rules.at(rule_label.c_str(), rule_label.size(), [](pointer_decoder& data){ data.next(data.next_2B()); });
8974	0	0	if (!found) break;
			// Skip the 2-byte length; the byte after it is the number of rules,
			// zero meaning the key exists only as a prefix of longer keys.
8975	0	0	if (*(found += sizeof(uint16_t))) {
8976			rule = found;
8977			rule_prefix_len = prefix_len;
8978			}
8979			}
8980
8981	0	0	if (rule) {
8982	0		rule_label.resize(suffix_len + 1 + rule_prefix_len);
8983	0	0	if (rule_label.size() > 1 && !contains(used, rule_label)) { // ignore rule ' '
		0
		0
8984	0	0	if (used) used->push_back(rule_label);
		0
			// Each rule is a sequence of length-prefixed fields: prefix to delete,
			// prefix to add, suffix to delete, suffix to add, and a list of
			// 2-byte tag ids.
			// NOTE(review): the five rows below look damaged by the report
			// rendering -- dereference/pointer `*` characters appear to be
			// missing (e.g. `= rule++`, `const char pref_del = (const char*)rule`,
			// `tags_len sizeof(uint16_t)`). Check the original source before
			// relying on the exact text.
8985	0	0	for (int rules_len = *rule++; rules_len; rules_len--) {
8986	0		unsigned pref_del_len = rule++; const char pref_del = (const char*)rule; rule += pref_del_len;
8987	0		unsigned pref_add_len = rule++; const char pref_add = (const char*)rule; rule += pref_add_len;
8988	0		unsigned suff_del_len = rule++; const char suff_del = (const char*)rule; rule += suff_del_len;
8989	0		unsigned suff_add_len = rule++; const char suff_add = (const char*)rule; rule += suff_add_len;
8990	0		unsigned tags_len = rule++; const uint16_t tags = (const uint16_t)rule; rule += tags_len sizeof(uint16_t);
8991
			// Skip the rule when the parts to delete do not fit the form, do not
			// match its start/end, or when the resulting lemma would be empty.
8992	0	0	if (pref_del_len + suff_del_len > form.len \|\|
		0
8993	0	0	(pref_del_len && !small_memeq(pref_del, form.str, pref_del_len)) \|\|
		0
8994	0	0	(suff_del_len && !small_memeq(suff_del, form.str + form.len - suff_del_len, suff_del_len)) \|\|
		0
		0
8995	0		(form.len + pref_add_len - pref_del_len + suff_add_len - suff_del_len == 0))
8996	0		continue;
8997
			// lemma = pref_add + (form without deleted prefix/suffix) + suff_add
8998			string lemma;
8999	0	0	lemma.reserve(form.len + pref_add_len - pref_del_len + suff_add_len - suff_del_len);
9000	0	0	if (pref_add_len) lemma.append(pref_add, pref_add_len);
		0
9001	0	0	if (pref_del_len + suff_del_len < form.len) lemma.append(form.str + pref_del_len, form.len - pref_del_len - suff_del_len);
		0
9002	0	0	if (suff_add_len) lemma.append(suff_add, suff_add_len);
		0
			// One result per tag id listed in the rule.
9003	0	0	while (tags_len--)
9004	0	0	lemmas.emplace_back(lemma, this->tags[unaligned_load_inc(tags)]);
9005			}
9006			}
			// Stop after the first (longest) suffix that has a rule, whether or
			// not it was applied.
9007			break;
9008			}
9009			}
9010
9011			// If nothing was found, use default tag.
			// The empty label stands for the default rule in `used`.
9012	0	0	if (lemmas.size() == lemmas_initial_size)
9013	0	0	if (!contains(used, string())) {
9014	0	0	if (used) used->push_back(string());
9015	0	0	lemmas.emplace_back(string(form.str, form.len), tags[default_tag]);
9016			}
9017	0		}
9018
9019			} // namespace morphodita
9020
9021			/////////
9022			// File: utils/split.h
9023			/////////
9024
9025			// This file is part of UFAL C++ Utils .
9026			//
9027			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
9028			// Mathematics and Physics, Charles University in Prague, Czech Republic.
9029			//
9030			// This Source Code Form is subject to the terms of the Mozilla Public
9031			// License, v. 2.0. If a copy of the MPL was not distributed with this
9032			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
9033
9034			namespace utils {
9035
9036			//
9037			// Declarations
9038			//
9039
9040			// Split given text on the separator character.
9041			inline void split(const string& text, char sep, vector& tokens);
9042			inline void split(string_piece text, char sep, vector& tokens);
9043
9044			//
9045			// Definitions
9046			//
9047
9048	0		void split(const string& text, char sep, vector& tokens) {
			// Splits `text` at every occurrence of `sep` and stores the pieces in
			// `tokens`, replacing its previous contents. Empty pieces are kept, so
			// a text with N separators yields N+1 tokens. An empty `text` yields no
			// tokens at all.
9049	0		tokens.clear();
9050	0	0	if (text.empty()) return;
9051
			// index is the start of the current piece; next is the position of the
			// separator that ends it.
9052	0		string::size_type index = 0;
9053	0	0	for (string::size_type next; (next = text.find(sep, index)) != string::npos; index = next + 1)
9054	0		tokens.emplace_back(text, index, next - index);
9055
			// The remainder after the last separator (possibly empty).
9056	0		tokens.emplace_back(text, index);
9057			}
9058
9059	53		void split(string_piece text, char sep, vector& tokens) {
			// string_piece variant of split: the pieces of `text` between occurrences
			// of `sep` are stored in `tokens` (previous contents are discarded),
			// each constructed from a pointer into `text` and a length. Empty
			// pieces are kept; a zero-length `text` yields no tokens.
9060			tokens.clear();
9061	53	50	if (!text.len) return;
9062
			// str is the start of the current piece; memchr searches only the part
			// of text that has not been consumed yet.
9063	53		const char* str = text.str;
9064	121	100	for (const char* next; (next = (const char*) memchr(str, sep, text.str + text.len - str)); str = next + 1)
9065	68		tokens.emplace_back(str, next - str);
9066
			// The remainder after the last separator (possibly empty).
9067	53		tokens.emplace_back(str, text.str + text.len - str);
9068			}
9069
9070			} // namespace utils
9071
9072			/////////
9073			// File: morphodita/morpho/morpho_statistical_guesser_encoder.cpp
9074			/////////
9075
9076			// This file is part of MorphoDiTa .
9077			//
9078			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
9079			// Mathematics and Physics, Charles University in Prague, Czech Republic.
9080			//
9081			// This Source Code Form is subject to the terms of the Mozilla Public
9082			// License, v. 2.0. If a copy of the MPL was not distributed with this
9083			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
9084
9085			namespace morphodita {
9086
9087	0		void morpho_statistical_guesser_encoder::encode(istream& is, binary_encoder& enc) {
			// Reads a textual statistical guesser from `is` and writes its binary
			// form to `enc`.
			// Input format, as parsed below:
			//   - first line: the default tag;
			//   - every further line: tab-separated columns, an odd number and at
			//     least three. Column 0 holds two space-separated affixes; the
			//     remaining columns come in pairs of a replacement rule (four
			//     space-separated fields) and a space-separated list of tags.
			// Output: 2-byte tag count, the tags (1-byte length + text), the 2-byte
			// default tag id, and the rules as a persistent_unordered_map.
			// Any line that does not fit the format raises training_failure.
9088			unordered_map, vector>>> statistical_guesser;
9089	0		vector tags;
9090			unordered_map tags_map;
9091
9092			// Load statistical guesser
9093			string line;
9094	0		vector tokens;
9095	0	0	if (!getline(is, line)) training_failure("Missing first line with default tag in statistical guesser file");
		0
		0
		0
			// Tag ids are assigned in order of first appearance; emplace returns the
			// existing id for a tag already seen.
9096	0		int statistical_guesser_default = tags_map.emplace(line.data(), int(tags.size())).first->second;
9097	0	0	if (unsigned(statistical_guesser_default) >= tags.size()) tags.emplace_back(line.data());
		0
9098
9099	0	0	while (getline(is, line)) {
		0
9100	0	0	split(line, '\t', tokens);
9101	0	0	if (tokens.size() < 3 \|\| (tokens.size() % 2) != 1) training_failure("Cannot parse line " << line << " in statistical guesser file!");
		0
		0
		0
		0
9102
9103	0		vector affixes;
9104	0	0	split(tokens[0], ' ', affixes);
9105	0	0	if (affixes.size() != 2) training_failure("Cannot parse prefix_suffix '" << tokens[0] << "' in statistical guesser file!");
		0
		0
			// The second affix is stored reversed and placed first in the key, which
			// matches how analyze() builds its lookup label.
9106			reverse(affixes[1].begin(), affixes[1].end());
9107
9108	0	0	auto& rules = statistical_guesser[affixes[1] + ' ' + affixes[0]];
		0
			// Columns 1.. come in (replacement rule, tags) pairs.
9109	0	0	for (unsigned i = 1; i < tokens.size(); i+= 2) {
9110	0		vector replacements;
9111	0	0	split(tokens[i], ' ', replacements);
9112	0	0	if (replacements.size() != 4) training_failure("Cannot parse replacement rule '" << tokens[i] << "' in statistical guesser file!");
		0
		0
9113
9114	0		vector rule_tags;
9115	0	0	split(tokens[i+1], ' ', rule_tags);
9116			vector decoded_tags;
9117	0	0	for (auto&& rule_tag : rule_tags) {
9118	0		int tag = tags_map.emplace(rule_tag, int(tags.size())).first->second;
9119	0	0	if (unsigned(tag) >= tags.size()) tags.emplace_back(rule_tag);
		0
9120	0	0	decoded_tags.emplace_back(tag);
9121			}
9122
9123	0	0	rules.emplace_back(replacements, decoded_tags);
9124			}
9125			}
9126
9127			// Encode statistical guesser
9128	0	0	enc.add_2B(tags.size());
9129	0	0	for (auto&& tag : tags) {
9130	0	0	enc.add_1B(tag.size());
9131			enc.add_data(tag);
9132			}
9133	0	0	enc.add_2B(statistical_guesser_default);
9134
			// Built with load factor 5, add_prefixes=true, add_suffixes=false, so
			// every proper prefix of a key is also present (with an empty rule
			// list). Each value is written as a 2-byte byte length followed by:
			// rule count (1 byte), then per rule the four affixes (1-byte length +
			// text each), the tag count (1 byte) and the 2-byte tag ids.
9135	0		persistent_unordered_map(statistical_guesser, 5, true, false, [](binary_encoder& enc, vector, vector>> rules) {
9136	0		binary_encoder e;
9137	0	0	e.add_1B(rules.size());
9138	0	0	for (auto&& rule : rules) {
9139	0	0	if (rule.first.size() != 4) training_failure("Replacement rule not of size 4 in statistical guesser!");
		0
		0
9140	0	0	for (auto&& affix : rule.first) {
9141	0	0	e.add_1B(affix.size());
9142			e.add_data(affix);
9143			}
9144	0	0	e.add_1B(rule.second.size());
9145	0	0	for (auto&& tag : rule.second)
9146	0	0	e.add_2B(tag);
9147			}
9148	0	0	enc.add_2B(e.data.size());
9149			enc.add_data(e.data);
9150	0	0	}).save(enc);
		0
9151	0		}
9152
9153			} // namespace morphodita
9154
9155			/////////
9156			// File: morphodita/morpho/morpho_statistical_guesser_trainer.h
9157			/////////
9158
9159			// This file is part of MorphoDiTa .
9160			//
9161			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
9162			// Mathematics and Physics, Charles University in Prague, Czech Republic.
9163			//
9164			// This Source Code Form is subject to the terms of the Mozilla Public
9165			// License, v. 2.0. If a copy of the MPL was not distributed with this
9166			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
9167
9168			namespace morphodita {
9169
9170			class morpho_statistical_guesser_trainer {
			// Trainer of the statistical guesser. Only the interface is declared
			// here; train() and the helpers are defined in the corresponding .cpp.
9171			public:
			// Reads training data from `is` and writes the trained guesser to `os`.
			// The definition parses each non-empty input line as three tab-separated
			// columns, which it passes on as form, lemma and tag.
			// NOTE(review): the exact meaning of suffix_len, rules_per_suffix,
			// max_prefixes and min_prefix_count is not fully visible in this chunk.
9172			static void train(istream& is, unsigned suffix_len, unsigned rules_per_suffix, unsigned max_prefixes, unsigned min_prefix_count, ostream& os);
9173
9174			private:
			// One training example, plus two strings derived from it (lemma_rule and
			// form_prefix) that are set by the constructor defined elsewhere.
9175	0		struct instance {
9176			string form, lemma, tag;
9177			string lemma_rule, form_prefix;
9178
9179			instance(const string& form, const string& lemma, const string& tag);
9180			};
9181
			// Casing classes used when normalizing the case of training forms.
			// NOTE(review): presumably lowercase / first-upper-rest-lower / all
			// uppercase / anything else -- confirm in get_casing().
9182			enum casing { CASE_LC, CASE_UCLC, CASE_UC, CASE_OTHER };
9183			static casing get_casing(const string& word, bool allow_nonletters);
9184			static void set_casing(const string& original, casing c, string& word);
9185			static bool suffix(const string& word, unsigned& length);
9186			};
9187
9188			} // namespace morphodita
9189
9190			/////////
9191			// File: morphodita/morpho/morpho_statistical_guesser_trainer.cpp
9192			/////////
9193
9194			// This file is part of MorphoDiTa .
9195			//
9196			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
9197			// Mathematics and Physics, Charles University in Prague, Czech Republic.
9198			//
9199			// This Source Code Form is subject to the terms of the Mozilla Public
9200			// License, v. 2.0. If a copy of the MPL was not distributed with this
9201			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
9202
9203			namespace morphodita {
9204
9205	0		void morpho_statistical_guesser_trainer::train(istream& is, unsigned suffix_len, unsigned rules_per_suffix, unsigned max_prefixes, unsigned min_prefix_count, ostream& os) {
9206	0		vector data;
9207
9208			// Load training data
9209			string form;
9210	0		vector tokens;
9211	0	0	for (string line; getline(is, line);) {
		0
9212	0	0	if (line.empty()) continue;
9213
9214	0	0	split(line, '\t', tokens);
9215	0	0	if (tokens.size() != 3) training_failure("The guesser training line '" << line << "' does not contain three columns!");
		0
		0
9216	0	0	if (tokens[0].empty() \|\| tokens[1].empty() \|\| tokens[2].empty()) training_failure("The guesser training line '" << line << "' contains an empty column!");
		0
		0
		0
		0
		0
9217
9218			// Normalize case
9219	0		casing form_case = get_casing(tokens[0], false);
9220	0		casing lemma_case = get_casing(tokens[1], true);
9221	0	0	if ((lemma_case == CASE_LC && (form_case == CASE_UCLC \|\| form_case == CASE_UC)) \|\|
		0
		0
9222	0		(lemma_case == CASE_UCLC && form_case == CASE_UC)) {
9223	0	0	set_casing(tokens[0], lemma_case, form);
9224			} else {
9225	0		form.swap(tokens[0]);
9226			}
9227
9228	0	0	data.emplace_back(form, tokens[1], tokens[2]);
9229			}
9230
9231			// Generate at most max_prefixes prefixes with min_prefix_count
9232			unordered_map> prefixes_with_forms;
9233	0	0	for (auto&& instance : data)
9234	0	0	if (!instance.form_prefix.empty())
9235	0		prefixes_with_forms[instance.form_prefix].insert(instance.form);
9236
9237	0		vector> prefixes_with_counts;
9238	0	0	for (auto&& prefix : prefixes_with_forms)
9239	0	0	if (prefix.second.size() >= min_prefix_count)
9240	0	0	prefixes_with_counts.emplace_back(unsigned(prefix.second.size()), prefix.first);
9241
9242	0	0	if (prefixes_with_counts.size() > max_prefixes) {
9243			sort(prefixes_with_counts.begin(), prefixes_with_counts.end(), greater>());
9244	0	0	prefixes_with_counts.resize(max_prefixes);
9245			}
9246
9247			unordered_set prefixes;
9248			prefixes.emplace();
9249	0	0	for (auto&& prefix : prefixes_with_counts)
9250	0		prefixes.insert(prefix.second);
9251
9252			// Generate the guesser rules
9253			unordered_map> tags;
9254			unordered_map>> rules;
9255			unordered_set suffixes;
9256			string prefix_suffix, tag_lemma_rule;
9257	0	0	for (auto&& instance : data) {
9258			// Add tag
9259	0		tags[instance.tag].insert(instance.form);
9260
9261			// Find longest matching prefix
9262			unsigned prefix_length = 0;
9263	0	0	for (auto&& prefix : prefixes)
9264	0	0	if (prefix.size() > prefix_length && instance.form.compare(0, prefix.size(), prefix) == 0)
		0
		0
		0
9265	0		prefix_length = prefix.size();
9266
9267	0	0	tag_lemma_rule.assign(instance.lemma_rule).append("\t").append(instance.tag);
9268
9269			// Add prefix + all suffixes of length 1..suffix_len to rules
9270	0	0	for (unsigned length = 0, utf8_length = 0; length < suffix_len && suffix(instance.form, utf8_length); length++) {
		0
		0
9271	0	0	prefix_suffix.assign(instance.form, 0, prefix_length).append(" ").append(instance.form, instance.form.size() - utf8_length, utf8_length);
		0
		0
9272			rules[prefix_suffix][tag_lemma_rule].insert(instance.form);
9273	0		suffixes.emplace(instance.form, instance.form.size() - utf8_length, utf8_length);
9274			}
9275			}
9276
9277			// Start generating the guesser description by writing the most "frequent" tag
9278			string most_frequent_tag; unsigned most_frequent_tag_count = 0;
9279	0	0	for (auto&& tag : tags)
9280	0	0	if (tag.second.size() > most_frequent_tag_count)
9281	0		most_frequent_tag.assign(tag.first), most_frequent_tag_count = tag.second.size();
9282
9283			os << most_frequent_tag << endl;
9284
9285			// For every prefix-suffix, write at most rules_per_suffix most "frequent" rules
9286			string rule_key, output;
9287			unordered_set rules_set;
9288	0		vector> rules_counts;
9289	0	0	for (auto&& suffix : suffixes) {
9290	0	0	for (auto&& prefix : prefixes) {
9291	0		rules_counts.clear();
9292			rules_set.clear();
9293
9294			// Gather at most rules_per_suffix rules
9295	0	0	for (int prefix_len = int(prefix.size()); prefix_len >= 0; prefix_len -= prefix.empty() ? 1 : prefix.size()) {
		0
9296	0	0	for (int suffix_len = int(suffix.size()); rules_counts.size() < rules_per_suffix && suffix_len > 0; suffix_len--) {
		0
		0
9297	0	0	rule_key.assign(prefix, 0, prefix_len).append(" ").append(suffix, suffix.size() - suffix_len, suffix_len);
		0
		0
9298	0	0	if (!rules.count(rule_key)) continue;
9299
9300			unsigned rules_counts_original = rules_counts.size();
9301	0	0	for (auto&& entry : rules[rule_key])
9302	0	0	if (!rules_set.count(entry.first)) {
9303	0	0	rules_counts.emplace_back(unsigned(entry.second.size()), entry.first);
9304			rules_set.insert(entry.first);
9305			}
9306
9307			sort(rules_counts.begin() + rules_counts_original, rules_counts.end(), greater>());
9308
9309	0	0	if (rules_counts.size() >= rules_per_suffix) {
9310	0	0	rules_counts.resize(rules_per_suffix);
9311			break;
9312			}
9313			}
9314			// Stop if there are no rules for given prefix
9315	0	0	if (rules_set.empty()) break;
9316			}
9317	0	0	if (!rules_set.empty()) {
9318			// Write the chosen rules
9319	0	0	output.assign(prefix).append(" ").append(suffix);
9320	0	0	for (unsigned i = 0; i < rules_counts.size(); i++) {
9321	0		unsigned tab = rules_counts[i].second.find('\t');
9322
9323	0	0	output.append("\t").append(rules_counts[i].second, 0, tab).append("\t").append(rules_counts[i].second, tab + 1, string::npos);
		0
		0
		0
9324
9325			// Join rules with same lemma_rule
9326	0	0	for (unsigned start = i; i+1 < rules_counts.size() && rules_counts[i+1].second.compare(0, tab + 1, rules_counts[start].second, 0, tab + 1) == 0; i++)
		0
		0
		0
9327	0	0	output.append(" ").append(rules_counts[i+1].second, tab + 1, string::npos);
		0
9328			}
9329			os << output << endl;
9330			}
9331			}
9332			}
9333	0		}
9334
9335	0		morpho_statistical_guesser_trainer::instance::instance(const string& form, const string& lemma, const string& tag)
9336	0		: form(form), lemma(lemma), tag(tag)
9337			{
9338			using namespace unilib;
9339
9340			unsigned length_best = 0;
9341			int form_best = 0, lemma_best = 0;
9342	0	0	for (int offset = -int(lemma.size() - 1); offset < int(form.size()) - 1; offset++) {
9343	0		unsigned form_offset = max(0, offset);
9344	0		unsigned lemma_offset = max(0, -offset);
9345	0	0	for (unsigned length = 0; form_offset < form.size() && lemma_offset < lemma.size(); form_offset++, lemma_offset++)
		0
		0
9346	0	0	if (form[form_offset] == lemma[lemma_offset]) {
9347	0	0	if (++length > length_best && utf8::valid(form.c_str() + form_offset + 1 - length, length))
		0
		0
9348	0		length_best = length, form_best = form_offset + 1 - length, lemma_best = lemma_offset + 1 - length;
9349			} else {
9350			length = 0;
9351			}
9352			}
9353
9354	0	0	form_prefix.assign(form, 0, lemma_best == 0 ? form_best : 0);
		0
9355	0	0	lemma_rule.assign(form, 0, form_best).append(" ").append(lemma, 0, lemma_best).append(" ")
		0
		0
		0
9356	0	0	.append(form, form_best + length_best, string::npos).append(" ").append(lemma, lemma_best + length_best, string::npos);
		0
		0
9357	0		}
9358
9359	0		morpho_statistical_guesser_trainer::casing morpho_statistical_guesser_trainer::get_casing(const string& word, bool allow_nonletters) {
9360			using namespace unilib;
9361
9362			casing c = CASE_OTHER;
9363			int index = 0;
9364	0	0	for (auto&& chr : utf8::decoder(word)) {
9365	0		auto cat = unicode::category(chr);
9366
9367			// Return OTHER for non-letters
9368	0	0	if (allow_nonletters && index >= 2 && cat & ~unicode::L) continue;
		0
9369	0	0	if (cat & ~unicode::L) return CASE_OTHER;
9370
9371	0	0	if (index == 0) {
9372	0	0	c = cat & unicode::Ll ? CASE_LC : CASE_UC;
9373	0	0	} else if (c == CASE_UC && index == 1) {
9374	0	0	c = cat & unicode::Ll ? CASE_UCLC : CASE_UC;
9375	0	0	} else if (c == CASE_UC) {
9376	0	0	if (cat & ~unicode::Lut) return CASE_OTHER;
9377			} else /CASE_LC or CASE_UCLC/ {
9378	0	0	if (cat & ~unicode::Ll) return CASE_OTHER;
9379			}
9380	0		index++;
9381			}
9382	0		return c;
9383			}
9384
9385	0		void morpho_statistical_guesser_trainer::set_casing(const string& original, casing c, string& word) {
9386			using namespace unilib;
9387
9388			word.clear();
9389			bool first = true;
9390	0	0	for (auto&& chr : utf8::decoder(original)) {
9391	0	0	utf8::append(word, (c == CASE_UC \|\| (c == CASE_UCLC && first)) ? unicode::uppercase(chr) : unicode::lowercase(chr));
		0
9392			first = false;
9393			}
9394	0		}
9395
9396	0		bool morpho_statistical_guesser_trainer::suffix(const string& word, unsigned& length) {
9397			using namespace unilib;
9398
9399			unsigned additional = 1;
9400	0	0	while (additional + length <= word.size() && !utf8::valid(word.c_str() + word.size() - length - additional, additional))
		0
		0
9401	0		additional++;
9402
9403	0	0	if (additional + length > word.size()) return false;
9404
9405	0		length += additional;
9406	0		return true;
9407			}
9408
9409			} // namespace morphodita
9410
9411			/////////
9412			// File: morphodita/morpho/raw_morpho_dictionary_reader.cpp
9413			/////////
9414
9415			// This file is part of MorphoDiTa .
9416			//
9417			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
9418			// Mathematics and Physics, Charles University in Prague, Czech Republic.
9419			//
9420			// This Source Code Form is subject to the terms of the Mozilla Public
9421			// License, v. 2.0. If a copy of the MPL was not distributed with this
9422			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
9423
9424			namespace morphodita {
9425
9426	0		bool raw_morpho_dictionary_reader::next_lemma(string& lemma, vector>& tagged_forms) {
9427	0	0	if (line.empty()) {
9428	0	0	if (!getline(in, line))
9429			return false;
9430	0		split(line, '\t', tokens);
9431	0	0	if (tokens.size() != 3) training_failure("Line " << line << " does not have three columns!");
		0
		0
9432			}
9433
9434			lemma = tokens[0];
9435	0	0	if (seen_lemmas.count(lemma))
9436	0	0	training_failure("Raw morphological dictionary contains lemma '" << lemma << "' multiple times - all forms of one lemma must be in continuous region!");
		0
9437			seen_lemmas.insert(lemma);
9438
9439			tagged_forms.clear();
9440	0		tagged_forms.emplace_back(tokens[2], tokens[1]);
9441	0	0	while (getline(in, line)) {
9442	0		split(line, '\t', tokens);
9443	0	0	if (tokens.size() != 3) training_failure("Line " << line << " does not have three columns!");
		0
		0
9444
9445	0	0	if (lemma != tokens[0]) break;
9446	0		tagged_forms.emplace_back(tokens[2], tokens[1]);
9447			}
9448
9449			return true;
9450			}
9451
9452			} // namespace morphodita
9453
9454			/////////
9455			// File: morphodita/morpho/tag_filter.cpp
9456			/////////
9457
9458			// This file is part of MorphoDiTa .
9459			//
9460			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
9461			// Mathematics and Physics, Charles University in Prague, Czech Republic.
9462			//
9463			// This Source Code Form is subject to the terms of the Mozilla Public
9464			// License, v. 2.0. If a copy of the MPL was not distributed with this
9465			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
9466
9467			namespace morphodita {
9468
9469	0		tag_filter::tag_filter(const char* filter) {
9470	0	0	if (!filter) return;
9471
9472	0	0	wildcard.assign(filter);
9473			filter = wildcard.c_str();
9474
9475	0	0	for (int tag_pos = 0, filter_pos = 0; filter[filter_pos]; tag_pos++, filter_pos++) {
9476	0	0	if (filter[filter_pos] == '?') continue;
9477	0	0	if (filter[filter_pos] == '[') {
9478	0		filter_pos++;
9479
9480	0		bool negate = false;
9481	0	0	if (filter[filter_pos] == '^') negate = true, filter_pos++;
9482
9483	0		int chars_start = filter_pos;
9484	0	0	for (bool first = true; filter[filter_pos] && (first \|\| filter[filter_pos] != ']'); first = false)
		0
		0
9485	0		filter_pos++;
9486
9487	0	0	filters.emplace_back(tag_pos, negate, chars_start, filter_pos - chars_start);
9488	0	0	if (!filter[filter_pos]) break;
9489			} else {
9490	0	0	filters.emplace_back(tag_pos, false, filter_pos, 1);
9491			}
9492			}
9493			}
9494
9495			} // namespace morphodita
9496
9497			/////////
9498			// File: morphodita/tagger/elementary_features.h
9499			/////////
9500
9501			// This file is part of MorphoDiTa .
9502			//
9503			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
9504			// Mathematics and Physics, Charles University in Prague, Czech Republic.
9505			//
9506			// This Source Code Form is subject to the terms of the Mozilla Public
9507			// License, v. 2.0. If a copy of the MPL was not distributed with this
9508			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
9509
9510			namespace morphodita {
9511
9512			// Declarations
// Kind of an elementary feature.
enum elementary_feature_type {
  PER_FORM,
  PER_TAG,
  DYNAMIC
};

// Positions at which an elementary feature may be used.
enum elementary_feature_range {
  ONLY_CURRENT,
  ANY_OFFSET
};

// Value of an elementary feature, with two reserved values.
using elementary_feature_value = uint32_t;
enum : elementary_feature_value {
  elementary_feature_unknown = 0,
  elementary_feature_empty = 1
};
9518
9519	136		struct elementary_feature_description {
9520			string name;
9521			elementary_feature_type type;
9522			elementary_feature_range range;
9523			int index;
9524			int map_index;
9525			};
9526
9527			template
9528	1		class elementary_features {
9529			public:
9530			bool load(istream& is);
9531			bool save(ostream& out);
9532
9533			vector maps;
9534			};
9535
9536	0		class persistent_elementary_feature_map : public persistent_unordered_map {
9537			public:
9538			persistent_elementary_feature_map() : persistent_unordered_map() {}
9539			persistent_elementary_feature_map(const persistent_unordered_map&& map) : persistent_unordered_map(map) {}
9540
9541			elementary_feature_value value(const char* feature, int len) const {
9542	92		auto* it = at_typed(feature, len);
9543	92	0	return it ? unaligned_load(it) : elementary_feature_unknown;
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		50
		100
		50
		100
		50
		50
		50
		50
		50
		50
		50
		50
		50
		50
		50
		50
		50
		50
		50
		50
		50
		50
		50
9544			}
9545			};
9546
9547			// Definitions
9548			template
9549	1		inline bool elementary_features::load(istream& is) {
9550			binary_decoder data;
9551	1	50	if (!compressor::load(is, data)) return false;
		50
9552
9553			try {
9554	1	50	maps.resize(data.next_1B());
		50
9555	28	100	for (auto&& map : maps)
9556	27	50	map.load(data);
		0
9557			} catch (binary_decoder_error&) {
9558			return false;
9559			}
9560
9561	1		return data.is_end();
9562			}
9563
9564			} // namespace morphodita
9565
9566			/////////
9567			// File: morphodita/tagger/vli.h
9568			/////////
9569
9570			// This file is part of MorphoDiTa .
9571			//
9572			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
9573			// Mathematics and Physics, Charles University in Prague, Czech Republic.
9574			//
9575			// This Source Code Form is subject to the terms of the Mozilla Public
9576			// License, v. 2.0. If a copy of the MPL was not distributed with this
9577			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
9578
9579			namespace morphodita {
9580
9581			// Declarations
// Variable-length integer coding. A value is stored as big-endian groups of
// 7 bits, one group per byte; every byte except the last one has the top bit
// set.
template <class T>
class vli {
 public:
  // Maximum number of bytes encode() can produce.
  static int max_length();
  // Encode `value` at `where` and advance `where` behind the written bytes.
  static void encode(T value, char*& where);
  // Decode a value at `from` and advance `from` behind the read bytes.
  static T decode(const char*& from);
};

// Definitions
template <>
inline int vli<uint32_t>::max_length() {
  // 32 bits in groups of 7 bits need at most 5 bytes.
  return 5;
}

template <>
inline void vli<uint32_t>::encode(uint32_t value, char*& where) {
  // Find the shift of the most significant non-empty 7-bit group
  // (at most 28 for a 32-bit value).
  int shift = 0;
  while (shift < 28 && (value >> (shift + 7))) shift += 7;

  // All groups but the last one carry the continuation bit.
  for (; shift > 0; shift -= 7)
    *where++ = ((value >> shift) & 0x7Fu) | 0x80u;
  *where++ = value & 0x7Fu;
}

template <>
inline uint32_t vli<uint32_t>::decode(const char*& from) {
  uint32_t value = 0;
  while (((unsigned char)(*from)) & 0x80u) value = (value << 7) | (((unsigned char)(*from++)) ^ 0x80u);
  value = (value << 7) | ((unsigned char)(*from++));
  return value;
}
9612
9613			} // namespace morphodita
9614
9615			/////////
9616			// File: morphodita/tagger/feature_sequences.h
9617			/////////
9618
9619			// This file is part of MorphoDiTa .
9620			//
9621			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
9622			// Mathematics and Physics, Charles University in Prague, Czech Republic.
9623			//
9624			// This Source Code Form is subject to the terms of the Mozilla Public
9625			// License, v. 2.0. If a copy of the MPL was not distributed with this
9626			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
9627
9628			namespace morphodita {
9629
9630			// Declarations
// Score of a single feature sequence.
using feature_sequence_score = int32_t;
// Sum of the scores of several feature sequences.
using feature_sequences_score = int64_t;
9633
9634			struct feature_sequence_element {
9635			elementary_feature_type type;
9636			int elementary_index;
9637			int sequence_index;
9638
9639			feature_sequence_element() {}
9640	0		feature_sequence_element(elementary_feature_type type, int elementary_index, int sequence_index) : type(type), elementary_index(elementary_index), sequence_index(sequence_index) {}
9641			};
9642
9643	74	0	struct feature_sequence {
		0
9644			vector elements;
9645			int dependant_range = 1;
9646			};
9647
9648			template
9649	3	0	class feature_sequences {
		0
		50
		0
		0
9650			public:
9651			typedef typename ElementaryFeatures::per_form_features per_form_features;
9652			typedef typename ElementaryFeatures::per_tag_features per_tag_features;
9653			typedef typename ElementaryFeatures::dynamic_features dynamic_features;
9654
9655			void parse(int window_size, istream& is);
9656			bool load(istream& is);
9657			bool save(ostream& os);
9658
9659			struct cache;
9660
9661			inline void initialize_sentence(const vector& forms, const vector>& analyses, cache& c) const;
9662			inline void compute_dynamic_features(int form_index, int tag_index, const dynamic_features* prev_dynamic, dynamic_features& dynamic, cache& c) const;
9663			inline feature_sequences_score score(int form_index, int tags_window[], int tags_unchanged, dynamic_features& dynamic, cache& c) const;
9664			void feature_keys(int form_index, int tags_window[], int tags_unchanged, dynamic_features& dynamic, vector& keys, cache& c) const;
9665
9666			ElementaryFeatures elementary;
9667			vector scores;
9668			vector sequences;
9669			};
9670
9671	0		class persistent_feature_sequence_map : public persistent_unordered_map {
9672			public:
9673			persistent_feature_sequence_map() : persistent_unordered_map() {}
9674			persistent_feature_sequence_map(const persistent_unordered_map&& map) : persistent_unordered_map(map) {}
9675
9676			feature_sequence_score score(const char* feature, int len) const {
9677	346		auto* it = at_typed(feature, len);
9678	346	0	return it ? unaligned_load(it) : 0;
		0
		100
9679			}
9680			};
9681
9682			template using persistent_feature_sequences = feature_sequences;
9683
9684			// Definitions
9685			template
9686	1		inline bool feature_sequences::load(istream& is) {
9687	1	50	if (!elementary.load(is)) return false;
		0
		0
9688
9689			binary_decoder data;
9690	1	50	if (!compressor::load(is, data)) return false;
		50
		0
		0
		0
		0
9691
9692			try {
9693	1	50	sequences.resize(data.next_1B());
		50
		0
		0
		0
		0
9694	75	100	for (auto&& sequence : sequences) {
		0
		0
9695	74	50	sequence.dependant_range = data.next_4B();
		0
		0
9696	74	50	sequence.elements.resize(data.next_1B());
		50
		0
		0
		0
		0
9697	228	100	for (auto&& element : sequence.elements) {
		0
		0
9698	154	50	element.type = elementary_feature_type(data.next_4B());
		0
		0
9699	154	50	element.elementary_index = data.next_4B();
		0
		0
9700	154	50	element.sequence_index = data.next_4B();
		0
		0
9701			}
9702			}
9703
9704	1	50	scores.resize(data.next_1B());
		50
		0
		0
		0
		0
9705	75	100	for (auto&& score : scores)
		0
		0
9706	74	50	score.load(data);
		0
		0
		0
		0
		0
9707			} catch (binary_decoder_error&) {
9708			return false;
9709			}
9710
9711	1		return data.is_end();
9712			}
9713
9714			template
9715	2		struct feature_sequences::cache {
9716			const vector* forms;
9717			const vector>* analyses;
9718			vector elementary_per_form;
9719			vector> elementary_per_tag;
9720
9721	0		struct cache_element {
9722			vector key;
9723			int key_size;
9724			feature_sequence_score score;
9725
9726	74	0	cache_element(int elements) : key(vli::max_length() * elements), key_size(0), score(0) {}
		0
		0
		0
9727			};
9728			vector caches;
9729			vector window;
9730			vector key;
9731			feature_sequences_score score;
9732
9733	1		cache(const feature_sequences& self) : score(0) {
9734	1	0	caches.reserve(self.sequences.size());
		0
		50
		0
9735			int max_sequence_elements = 0, max_window_size = 1;
9736	75	0	for (auto&& sequence : self.sequences) {
		0
		100
		0
9737	74	0	caches.emplace_back(int(sequence.elements.size()));
		0
		50
		0
9738	74	0	if (int(sequence.elements.size()) > max_sequence_elements) max_sequence_elements = sequence.elements.size();
		0
		100
		0
9739	228	0	for (auto&& element : sequence.elements)
		0
		100
		0
9740	154	0	if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size)
		0
		0
		0
		100
		100
		0
		0
9741			max_window_size = 1 - element.sequence_index;
9742			}
9743	1	0	key.resize(max_sequence_elements * vli::max_length());
		0
		50
		0
9744	1	0	window.resize(max_window_size);
		0
		50
		0
9745	1		}
9746			};
9747
9748			template
9749	1		void feature_sequences::initialize_sentence(const vector& forms, const vector>& analyses, cache& c) const {
9750			// Store forms and forms_size
9751	1		c.forms = &forms;
9752	1		c.analyses = &analyses;
9753
9754			// Enlarge elementary features vectors if needed
9755	1	0	if (forms.size() > c.elementary_per_form.size()) c.elementary_per_form.resize(forms.size() * 2);
		0
		50
		0
9756	1	0	if (forms.size() > c.elementary_per_tag.size()) c.elementary_per_tag.resize(forms.size() * 2);
		0
		50
		0
9757	8	0	for (unsigned i = 0; i < forms.size(); i++)
		0
		100
		0
9758	7	0	if (analyses[i].size() > c.elementary_per_tag[i].size())
		0
		50
		0
9759	7		c.elementary_per_tag[i].resize(analyses[i].size() * 2);
9760
9761			// Compute elementary features
9762	1		elementary.compute_features(forms, analyses, c.elementary_per_form, c.elementary_per_tag);
9763
9764			// Clear score cache, because scores may have been modified
9765	1		c.score = 0;
9766	75	0	for (auto&& cache : c.caches)
		0
		100
		0
9767	74		cache.key_size = cache.score = 0;
9768	1		}
9769
9770			template
9771	30		void feature_sequences::compute_dynamic_features(int form_index, int tag_index, const dynamic_features* prev_dynamic, dynamic_features& dynamic, cache& c) const {
9772	15	0	elementary.compute_dynamic_features((*c.analyses)[form_index][tag_index], c.elementary_per_form[form_index], c.elementary_per_tag[form_index][tag_index], form_index > 0 ? prev_dynamic : nullptr, dynamic);
		0
		100
		0
9773	15		}
9774
9775			template
9776	26		feature_sequences_score feature_sequences::score(int form_index, int tags_window[], int tags_unchanged, dynamic_features& dynamic, cache& c) const {
9777			// Start by creating a window of per_tag_features*
9778	43	0	for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++)
		0
		0
		0
		0
		0
		100
		100
		100
		0
		0
		0
9779	90		c.window[i] = &c.elementary_per_tag[form_index - i][tags_window[i]];
9780
9781			// Compute the score
9782	13		feature_sequences_score result = c.score;
9783	671	0	for (unsigned i = 0; i < sequences.size(); i++) {
		0
		100
		0
9784	658	0	if (tags_unchanged >= sequences[i].dependant_range)
		0
		100
		0
9785			break;
9786
9787	653		char* key = c.key.data();
9788	1824	0	for (unsigned j = 0; j < sequences[i].elements.size(); j++) {
		0
		100
		0
9789			auto& element = sequences[i].elements[j];
9790			elementary_feature_value value;
9791
9792	1345		switch (element.type) {
9793			case PER_FORM:
9794	475	0	value = form_index + element.sequence_index < 0 \|\| unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index];
		0
		0
		0
		100
		100
		0
		0
9795			break;
9796			case PER_TAG:
9797	844	0	value = form_index + element.sequence_index < 0 ? elementary_feature_empty : c.window[-element.sequence_index]->values[element.elementary_index];
		0
		100
		0
9798			break;
9799			case DYNAMIC:
9800			default:
9801	26		value = dynamic.values[element.elementary_index];
9802			}
9803
9804	1345	0	if (value == elementary_feature_unknown) {
		0
		100
		0
9805	174		key = c.key.data();
9806	174		break;
9807			}
9808	1171		vli::encode(value, key);
9809			}
9810
9811	653		result -= c.caches[i].score;
9812	653		int key_size = key - c.key.data();
9813	653	0	if (!key_size) {
		0
		100
		0
9814	174		c.caches[i].score = 0;
9815	174		c.caches[i].key_size = 0;
9816	834	0	} else if (key_size != c.caches[i].key_size \|\| !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) {
		0
		0
		0
		0
		0
		100
		100
		100
		0
		0
		0
9817	0		c.caches[i].score = scores[i].score(c.key.data(), key_size);
9818	346		c.caches[i].key_size = key_size;
9819	346		small_memcpy(c.caches[i].key.data(), c.key.data(), key_size);
9820			}
9821	653		result += c.caches[i].score;
9822			}
9823
9824	13		c.score = result;
9825	13		return result;
9826			}
9827
9828			template
9829	0		void feature_sequences::feature_keys(int form_index, int tags_window[], int tags_unchanged, dynamic_features& dynamic, vector& keys, cache& c) const {
9830	0		score(form_index, tags_window, tags_unchanged, dynamic, c);
9831
9832	0		keys.resize(c.caches.size());
9833	0	0	for (unsigned i = 0; i < c.caches.size(); i++)
9834	0		keys[i].assign(c.caches[i].key.data(), c.caches[i].key_size);
9835	0		}
9836
9837			} // namespace morphodita
9838
9839			/////////
9840			// File: morphodita/tagger/viterbi.h
9841			/////////
9842
9843			// This file is part of MorphoDiTa .
9844			//
9845			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
9846			// Mathematics and Physics, Charles University in Prague, Czech Republic.
9847			//
9848			// This Source Code Form is subject to the terms of the Mozilla Public
9849			// License, v. 2.0. If a copy of the MPL was not distributed with this
9850			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
9851
9852			namespace morphodita {
9853
9854			// Declarations
9855			template
9856			class viterbi {
9857			public:
9858			viterbi(const FeatureSequences& features, int decoding_order, int window_size)
9859	1		: features(features), decoding_order(decoding_order), window_size(window_size) {}
9860
9861			struct cache;
9862			void tag(const vector& forms, const vector>& analyses, cache& c, vector& tags) const;
9863
9864			private:
9865			struct node;
9866
9867			const FeatureSequences& features;
9868			int decoding_order, window_size;
9869			};
9870
9871			// Definitions
9872			template
9873	2		struct viterbi::cache {
9874			vector nodes;
9875			typename FeatureSequences::cache features_cache;
9876
9877	1	0	cache(const viterbi& self) : features_cache(self.features) {}
		0
		50
		0
9878			};
9879
9880			template
9881			struct viterbi::node {
9882			int tag;
9883			int prev;
9884			feature_sequences_score score;
9885			typename FeatureSequences::dynamic_features dynamic;
9886			};
9887
9888			template
9889	1		void viterbi::tag(const vector& forms, const vector>& analyses, cache& c, vector& tags) const {
9890	2	0	if (!forms.size()) return;
		0
		50
		0
9891
9892			// Count number of nodes and allocate
9893			unsigned nodes = 0;
9894	8	0	for (unsigned i = 0, states = 1; i < forms.size(); i++) {
		0
		100
		0
9895	7	0	if (analyses[i].empty()) return;
		0
		50
		0
9896	7	0	states = (i+1 >= unsigned(decoding_order) ? states / analyses[i-decoding_order+1].size() : states) * analyses[i].size();
		0
		100
		0
9897	7		nodes += states;
9898			}
9899	1	0	if (nodes > c.nodes.size()) c.nodes.resize(nodes);
		0
		50
		0
9900
9901			// Init feature sequences
9902	1		features.initialize_sentence(forms, analyses, c.features_cache);
9903
9904			int window_stack[16]; vector window_heap;
9905	1	0	int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data());
		0
		0
		0
		50
		0
		0
		0
9906			typename FeatureSequences::dynamic_features dynamic;
9907			feature_sequences_score score;
9908
9909			// Compute all nodes score
9910			int nodes_prev = -1, nodes_now = 0;
9911	8	0	for (unsigned i = 0; i < forms.size(); i++) {
		0
		100
		0
9912			int nodes_next = nodes_now;
9913
9914	28	0	for (int j = 0; j < window_size; j++) window[j] = -1;
		0
		100
		0
9915	17	0	for (int tag = 0; tag < int(analyses[i].size()); tag++)
		0
		100
		0
9916	25	0	for (int prev = nodes_prev; prev < nodes_now; prev++) {
		0
		100
		0
9917			// Compute predecessors and number of unchanges
9918	15		int same_tags = window[0] == tag;
9919	15		window[0] = tag;
9920	36	0	for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) {
		0
		0
		0
		100
		100
		0
		0
9921	21	0	same_tags += same_tags == n && window[n] == c.nodes[p].tag;
		0
		0
		0
		100
		100
		0
		0
9922	42		window[n] = c.nodes[p].tag;
9923			}
9924
9925			// Compute dynamic elementary features and score
9926	15	0	features.compute_dynamic_features(i, tag, prev >= 0 ? &c.nodes[prev].dynamic : nullptr, dynamic, c.features_cache);
		0
		100
		0
9927	15	0	score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) +
		0
		0
		0
		0
		0
		100
		100
		100
		0
		0
		0
		0
9928	12		(prev >= 0 ? c.nodes[prev].score : 0);
9929
9930			// Update existing node or create a new one
9931	15	0	if (same_tags >= decoding_order-1) {
		0
		100
		0
9932	2	0	if (score <= c.nodes[nodes_next-1].score) continue;
		0
		100
		0
9933			nodes_next--;
9934			}
9935	28		c.nodes[nodes_next].tag = tag;
9936	14		c.nodes[nodes_next].prev = prev;
9937	14		c.nodes[nodes_next].score = score;
9938	14		c.nodes[nodes_next++].dynamic = dynamic;
9939			}
9940
9941			nodes_prev = nodes_now;
9942			nodes_now = nodes_next;
9943			}
9944
9945			// Choose the best ending node
9946			int best = nodes_prev;
9947	2	0	for (int node = nodes_prev + 1; node < nodes_now; node++)
		0
		100
		0
9948	1	0	if (c.nodes[node].score > c.nodes[best].score)
		0
		50
		0
9949			best = node;
9950
9951	8	0	for (int i = forms.size() - 1; i >= 0; i--, best = c.nodes[best].prev)
		0
		100
		0
9952	21		tags[i] = c.nodes[best].tag;
9953			}
9954
9955			} // namespace morphodita
9956
9957			/////////
9958			// File: morphodita/tagger/conllu_elementary_features.h
9959			/////////
9960
9961			// This file is part of MorphoDiTa .
9962			//
9963			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
9964			// Mathematics and Physics, Charles University in Prague, Czech Republic.
9965			//
9966			// This Source Code Form is subject to the terms of the Mozilla Public
9967			// License, v. 2.0. If a copy of the MPL was not distributed with this
9968			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
9969
9970			namespace morphodita {
9971
9972			// Declarations
9973			template
9974	1		class conllu_elementary_features : public elementary_features {
9975			public:
9976			conllu_elementary_features();
9977
9978			enum features_per_form { FORM, FOLLOWING_VERB_TAG, FOLLOWING_VERB_FORM, NUM, CAP, DASH, PREFIX1, PREFIX2, PREFIX3, PREFIX4, PREFIX5, PREFIX6, PREFIX7, PREFIX8, PREFIX9, SUFFIX1, SUFFIX2, SUFFIX3, SUFFIX4, SUFFIX5, SUFFIX6, SUFFIX7, SUFFIX8, SUFFIX9, PER_FORM_TOTAL };
9979			enum features_per_tag { TAG, TAG_UPOS, TAG_CASE, TAG_GENDER, TAG_NUMBER, TAG_NEGATIVE, TAG_PERSON, LEMMA, PER_TAG_TOTAL };
9980			enum features_dynamic { PREVIOUS_VERB_TAG, PREVIOUS_VERB_FORM, PREVIOUS_OR_CURRENT_VERB_TAG, PREVIOUS_OR_CURRENT_VERB_FORM, DYNAMIC_TOTAL };
9981			enum features_map { MAP_NONE = -1, MAP_FORM, MAP_PREFIX1, MAP_PREFIX2, MAP_PREFIX3, MAP_PREFIX4, MAP_PREFIX5, MAP_PREFIX6, MAP_PREFIX7, MAP_PREFIX8, MAP_PREFIX9, MAP_SUFFIX1, MAP_SUFFIX2, MAP_SUFFIX3, MAP_SUFFIX4, MAP_SUFFIX5, MAP_SUFFIX6, MAP_SUFFIX7, MAP_SUFFIX8, MAP_SUFFIX9, MAP_TAG, MAP_TAG_UPOS, MAP_TAG_CASE, MAP_TAG_GENDER, MAP_TAG_NUMBER, MAP_TAG_NEGATIVE, MAP_TAG_PERSON, MAP_LEMMA, MAP_TOTAL } ;
9982
9983			struct per_form_features { elementary_feature_value values[PER_FORM_TOTAL]; };
9984			struct per_tag_features { elementary_feature_value values[PER_TAG_TOTAL]; };
9985			struct dynamic_features { elementary_feature_value values[DYNAMIC_TOTAL]; };
9986
9987			static vector descriptions;
9988
9989			void compute_features(const vector& forms, const vector>& analyses, vector& per_form, vector>& per_tag) const;
9990			inline void compute_dynamic_features(const tagged_lemma& tag, const per_form_features& per_form, const per_tag_features& per_tag, const dynamic_features* prev_dynamic, dynamic_features& dynamic) const;
9991
9992			using elementary_features::maps;
9993			};
9994
9995			typedef conllu_elementary_features persistent_conllu_elementary_features;
9996
9997			// Definitions
9998			template
9999	1		conllu_elementary_features::conllu_elementary_features() {
10000	1	0	maps.resize(MAP_TOTAL);
		50
10001	1		}
10002
10003			template
10004	70	50	vector conllu_elementary_features::descriptions = {
		50
		100
		0
10005			{"Form", PER_FORM, ANY_OFFSET, FORM, MAP_FORM},
10006			{"FollowingVerbTag", PER_FORM, ANY_OFFSET, FOLLOWING_VERB_TAG, MAP_TAG},
10007			{"FollowingVerbForm", PER_FORM, ANY_OFFSET, FOLLOWING_VERB_FORM, MAP_FORM},
10008			{"Num", PER_FORM, ONLY_CURRENT, NUM, MAP_NONE},
10009			{"Cap", PER_FORM, ONLY_CURRENT, CAP, MAP_NONE},
10010			{"Dash", PER_FORM, ONLY_CURRENT, DASH, MAP_NONE},
10011			{"Prefix1", PER_FORM, ONLY_CURRENT, PREFIX1, MAP_PREFIX1},
10012			{"Prefix2", PER_FORM, ONLY_CURRENT, PREFIX2, MAP_PREFIX2},
10013			{"Prefix3", PER_FORM, ONLY_CURRENT, PREFIX3, MAP_PREFIX3},
10014			{"Prefix4", PER_FORM, ONLY_CURRENT, PREFIX4, MAP_PREFIX4},
10015			{"Prefix5", PER_FORM, ONLY_CURRENT, PREFIX5, MAP_PREFIX5},
10016			{"Prefix6", PER_FORM, ONLY_CURRENT, PREFIX6, MAP_PREFIX6},
10017			{"Prefix7", PER_FORM, ONLY_CURRENT, PREFIX7, MAP_PREFIX7},
10018			{"Prefix8", PER_FORM, ONLY_CURRENT, PREFIX8, MAP_PREFIX8},
10019			{"Prefix9", PER_FORM, ONLY_CURRENT, PREFIX9, MAP_PREFIX9},
10020			{"Suffix1", PER_FORM, ONLY_CURRENT, SUFFIX1, MAP_SUFFIX1},
10021			{"Suffix2", PER_FORM, ONLY_CURRENT, SUFFIX2, MAP_SUFFIX2},
10022			{"Suffix3", PER_FORM, ONLY_CURRENT, SUFFIX3, MAP_SUFFIX3},
10023			{"Suffix4", PER_FORM, ONLY_CURRENT, SUFFIX4, MAP_SUFFIX4},
10024			{"Suffix5", PER_FORM, ONLY_CURRENT, SUFFIX5, MAP_SUFFIX5},
10025			{"Suffix6", PER_FORM, ONLY_CURRENT, SUFFIX6, MAP_SUFFIX6},
10026			{"Suffix7", PER_FORM, ONLY_CURRENT, SUFFIX7, MAP_SUFFIX7},
10027			{"Suffix8", PER_FORM, ONLY_CURRENT, SUFFIX8, MAP_SUFFIX8},
10028			{"Suffix9", PER_FORM, ONLY_CURRENT, SUFFIX9, MAP_SUFFIX9},
10029
10030			{"Tag", PER_TAG, ANY_OFFSET, TAG, MAP_TAG},
10031			{"TagUPos", PER_TAG, ANY_OFFSET, TAG_UPOS, MAP_TAG_UPOS},
10032			{"TagCase", PER_TAG, ANY_OFFSET, TAG_CASE, MAP_TAG_CASE},
10033			{"TagGender", PER_TAG, ANY_OFFSET, TAG_GENDER, MAP_TAG_GENDER},
10034			{"TagNumber", PER_TAG, ANY_OFFSET, TAG_NUMBER, MAP_TAG_NUMBER},
10035			{"TagNegative", PER_TAG, ANY_OFFSET, TAG_NEGATIVE, MAP_TAG_NEGATIVE},
10036			{"TagPerson", PER_TAG, ANY_OFFSET, TAG_PERSON, MAP_TAG_PERSON},
10037			{"Lemma", PER_TAG, ANY_OFFSET, LEMMA, MAP_LEMMA},
10038
10039			{"PreviousVerbTag", DYNAMIC, ANY_OFFSET, PREVIOUS_VERB_TAG, MAP_TAG},
10040			{"PreviousVerbForm", DYNAMIC, ANY_OFFSET, PREVIOUS_VERB_FORM, MAP_FORM},
10041			};
10042
10043			template
10044	1		void conllu_elementary_features::compute_features(const vector& forms, const vector>& analyses, vector& per_form, vector>& per_tag) const {
10045			using namespace unilib;
10046
10047			// We process the sentence in reverse order, so that we can compute FollowingVerbTag and FollowingVerbLemma directly.
10048			elementary_feature_value following_verb_tag = elementary_feature_empty, following_verb_form = elementary_feature_empty;
10049	8	100	for (unsigned i = forms.size(); i--;) {
		0
10050			int verb_candidate = -1;
10051
10052			// Per_tag features and verb_candidate
10053	17	100	for (unsigned j = 0; j < analyses[i].size(); j++) {
		0
10054	10		const string& tag = analyses[i][j].tag;
10055	10		const string& lemma = analyses[i][j].lemma;
10056
10057			// Tag consists of three parts separated by tag[0] character
10058			// - first is TAG_UPOS,
10059			// - second is TAG_LPOS,
10060			// - then there is any number of \| separated named fields in format Name=Value
10061	0		per_tag[i][j].values[TAG] = maps[MAP_TAG].value(tag.c_str(), tag.size());
10062	10		per_tag[i][j].values[TAG_UPOS] = per_tag[i][j].values[TAG_CASE] = per_tag[i][j].values[TAG_GENDER] = elementary_feature_empty;
10063	10		per_tag[i][j].values[TAG_NUMBER] = per_tag[i][j].values[TAG_NEGATIVE] = per_tag[i][j].values[TAG_PERSON] = elementary_feature_empty;
10064	10	100	per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == lemma ? per_tag[i][j-1].values[LEMMA] :
		50
		0
		0
10065			maps[MAP_LEMMA].value(lemma.c_str(), lemma.size());
10066
10067	10		char separator = tag[0];
10068	10		size_t index = tag.find(separator, 1);
10069	10	50	if (index == string::npos) index = tag.size();
		0
10070	10	50	per_tag[i][j].values[TAG_UPOS] = maps[MAP_TAG_UPOS].value(tag.c_str() + (index ? 1 : 0), index - (index ? 1 : 0));
		50
		0
		0
10071
10072	10	50	if (index < tag.size()) index++;
		0
10073	10	50	if (index < tag.size()) index = tag.find(separator, index);
		0
10074	10	50	if (index < tag.size()) index++;
		0
10075	50	100	for (size_t length; index < tag.size(); index += length + 1) {
		0
10076	40		length = tag.find('\|', index);
10077	40	100	length = (length == string::npos ? tag.size() : length) - index;
		0
10078
10079	280	50	for (size_t equal_sign = 0; equal_sign + 1 < length; equal_sign++)
		0
10080	280	100	if (tag[index + equal_sign] == '=') {
		0
10081			int value = -1, map;
10082	40		switch (equal_sign) {
10083			case 4:
10084	6	100	if (tag.compare(index, equal_sign, "Case") == 0) value = TAG_CASE, map = MAP_TAG_CASE;
		0
10085			break;
10086			case 6:
10087	16	100	if (tag.compare(index, equal_sign, "Gender") == 0) value = TAG_GENDER, map = MAP_TAG_GENDER;
		0
10088	16	100	if (tag.compare(index, equal_sign, "Number") == 0) value = TAG_NUMBER, map = MAP_TAG_NUMBER;
		0
10089	16	100	if (tag.compare(index, equal_sign, "Person") == 0) value = TAG_PERSON, map = MAP_TAG_PERSON;
		0
10090			break;
10091			case 8:
10092	10	100	if (tag.compare(index, equal_sign, "Negative") == 0) value = TAG_NEGATIVE, map = MAP_TAG_NEGATIVE;
		0
10093			break;
10094			}
10095
10096	40	100	if (value >= 0)
		0
10097	19		per_tag[i][j].values[value] = maps[map].value(tag.c_str() + index + equal_sign + 1, length - equal_sign - 1);
10098			break;
10099			}
10100			}
10101
10102	10	50	if (tag.size() >= 2 && tag[1] == 'V') {
		100
		100
		0
		0
		0
10103			int tag_compare;
10104	5	100	verb_candidate = verb_candidate < 0 \|\| (tag_compare = tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) \|\| (tag_compare == 0 && lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate;
		50
		0
		0
10105			}
10106			}
10107
10108			// Per_form features
10109	0		per_form[i].values[FORM] = maps[MAP_FORM].value(forms[i].str, forms[i].len);
10110	7		per_form[i].values[FOLLOWING_VERB_TAG] = following_verb_tag;
10111	7		per_form[i].values[FOLLOWING_VERB_FORM] = following_verb_form;
10112
10113			// Update following_verb_{tag,lemma} _after_ filling FOLLOWING_VERB_{TAG,LEMMA}.
10114	7		if (verb_candidate >= 0) {
10115	4		following_verb_tag = per_tag[i][verb_candidate].values[TAG];
10116	2		following_verb_form = per_form[i].values[FORM];
10117			}
10118
10119			// Ortographic per_form features if needed
10120	7	100	if (analyses[i].size() == 1) {
		0
10121	5		per_form[i].values[NUM] = per_form[i].values[CAP] = per_form[i].values[DASH] = elementary_feature_unknown;
10122	5		per_form[i].values[PREFIX1] = per_form[i].values[PREFIX2] = per_form[i].values[PREFIX3] = elementary_feature_unknown;
10123	5		per_form[i].values[PREFIX4] = per_form[i].values[PREFIX5] = per_form[i].values[PREFIX6] = elementary_feature_unknown;
10124	5		per_form[i].values[PREFIX7] = per_form[i].values[PREFIX8] = per_form[i].values[PREFIX9] = elementary_feature_unknown;
10125	5		per_form[i].values[SUFFIX1] = per_form[i].values[SUFFIX2] = per_form[i].values[SUFFIX3] = elementary_feature_unknown;
10126	5		per_form[i].values[SUFFIX4] = per_form[i].values[SUFFIX5] = per_form[i].values[SUFFIX6] = elementary_feature_unknown;
10127	5		per_form[i].values[SUFFIX7] = per_form[i].values[SUFFIX8] = per_form[i].values[SUFFIX9] = elementary_feature_unknown;
10128	2	50	} else if (forms[i].len <= 0) {
		0
10129	0		per_form[i].values[NUM] = per_form[i].values[CAP] = per_form[i].values[DASH] = elementary_feature_empty + 1;
10130	0		per_form[i].values[PREFIX1] = per_form[i].values[PREFIX2] = per_form[i].values[PREFIX3] = elementary_feature_empty;
10131	0		per_form[i].values[PREFIX4] = per_form[i].values[PREFIX5] = per_form[i].values[PREFIX6] = elementary_feature_empty;
10132	0		per_form[i].values[PREFIX7] = per_form[i].values[PREFIX8] = per_form[i].values[PREFIX9] = elementary_feature_empty;
10133	0		per_form[i].values[SUFFIX1] = per_form[i].values[SUFFIX2] = per_form[i].values[SUFFIX3] = elementary_feature_empty;
10134	0		per_form[i].values[SUFFIX4] = per_form[i].values[SUFFIX5] = per_form[i].values[SUFFIX6] = elementary_feature_empty;
10135	0		per_form[i].values[SUFFIX7] = per_form[i].values[SUFFIX8] = per_form[i].values[SUFFIX9] = elementary_feature_empty;
10136			} else {
10137	2		string_piece form = forms[i];
10138	2		const char* form_start = form.str;
10139
10140			bool num = false, cap = false, dash = false;
10141	18		size_t indices[18] = {0, form.len, form.len, form.len, form.len, form.len, form.len, form.len, form.len, form.len, 0, 0, 0, 0, 0, 0, 0, 0}; // careful here regarding forms shorter than 9 characters
10142			int index = 0;
10143	18	100	while (form.len) {
		0
10144	16		indices[(index++) % 18] = form.str - form_start;
10145
10146	16		unicode::category_t cat = unicode::category(utf8::decode(form.str, form.len));
10147	16	50	num = num \|\| cat & unicode::N;
		50
		0
		0
10148	16	100	cap = cap \|\| cat & unicode::Lut;
		100
		0
		0
10149	16	50	dash = dash \|\| cat & unicode::Pd;
		50
		0
		0
10150
10151	16	50	if (index == 10 \|\| (!form.len && index < 10)) {
		100
		50
		0
		0
		0
10152	0		per_form[i].values[PREFIX1] = maps[MAP_PREFIX1].value(form_start, indices[1]);
10153	0		per_form[i].values[PREFIX2] = maps[MAP_PREFIX2].value(form_start, indices[2]);
10154	0		per_form[i].values[PREFIX3] = maps[MAP_PREFIX3].value(form_start, indices[3]);
10155	0		per_form[i].values[PREFIX4] = maps[MAP_PREFIX4].value(form_start, indices[4]);
10156	0		per_form[i].values[PREFIX5] = maps[MAP_PREFIX5].value(form_start, indices[5]);
10157	0		per_form[i].values[PREFIX6] = maps[MAP_PREFIX6].value(form_start, indices[6]);
10158	0		per_form[i].values[PREFIX7] = maps[MAP_PREFIX7].value(form_start, indices[7]);
10159	0		per_form[i].values[PREFIX8] = maps[MAP_PREFIX8].value(form_start, indices[8]);
10160	2		per_form[i].values[PREFIX9] = maps[MAP_PREFIX9].value(form_start, indices[9]);
10161			}
10162			}
10163	0		per_form[i].values[SUFFIX1] = maps[MAP_SUFFIX1].value(form_start + indices[(index+18-1) % 18], form.str - form_start - indices[(index+18-1) % 18]);
10164	0		per_form[i].values[SUFFIX2] = maps[MAP_SUFFIX2].value(form_start + indices[(index+18-2) % 18], form.str - form_start - indices[(index+18-2) % 18]);
10165	0		per_form[i].values[SUFFIX3] = maps[MAP_SUFFIX3].value(form_start + indices[(index+18-3) % 18], form.str - form_start - indices[(index+18-3) % 18]);
10166	0		per_form[i].values[SUFFIX4] = maps[MAP_SUFFIX4].value(form_start + indices[(index+18-4) % 18], form.str - form_start - indices[(index+18-4) % 18]);
10167	0		per_form[i].values[SUFFIX5] = maps[MAP_SUFFIX5].value(form_start + indices[(index+18-5) % 18], form.str - form_start - indices[(index+18-5) % 18]);
10168	0		per_form[i].values[SUFFIX6] = maps[MAP_SUFFIX6].value(form_start + indices[(index+18-6) % 18], form.str - form_start - indices[(index+18-6) % 18]);
10169	0		per_form[i].values[SUFFIX7] = maps[MAP_SUFFIX7].value(form_start + indices[(index+18-7) % 18], form.str - form_start - indices[(index+18-7) % 18]);
10170	0		per_form[i].values[SUFFIX8] = maps[MAP_SUFFIX8].value(form_start + indices[(index+18-8) % 18], form.str - form_start - indices[(index+18-8) % 18]);
10171	0		per_form[i].values[SUFFIX9] = maps[MAP_SUFFIX9].value(form_start + indices[(index+18-9) % 18], form.str - form_start - indices[(index+18-9) % 18]);
10172	2		per_form[i].values[NUM] = elementary_feature_empty + 1 + num;
10173	2		per_form[i].values[CAP] = elementary_feature_empty + 1 + cap;
10174	2		per_form[i].values[DASH] = elementary_feature_empty + 1 + dash;
10175			}
10176			}
10177	1		}
10178
10179			template
10180			void conllu_elementary_features::compute_dynamic_features(const tagged_lemma& tag, const per_form_features& per_form, const per_tag_features& per_tag, const dynamic_features* prev_dynamic, dynamic_features& dynamic) const {
10181	15	100	if (prev_dynamic) {
		0
10182	12		dynamic.values[PREVIOUS_VERB_TAG] = prev_dynamic->values[PREVIOUS_OR_CURRENT_VERB_TAG];
10183	12		dynamic.values[PREVIOUS_VERB_FORM] = prev_dynamic->values[PREVIOUS_OR_CURRENT_VERB_FORM];
10184			} else {
10185	3		dynamic.values[PREVIOUS_VERB_TAG] = elementary_feature_empty;
10186	3		dynamic.values[PREVIOUS_VERB_FORM] = elementary_feature_empty;
10187			}
10188
10189	15	50	if (tag.tag.size() >= 2 && tag.tag[1] == 'V') {
		100
		100
		0
		0
		0
10190	4		dynamic.values[PREVIOUS_OR_CURRENT_VERB_TAG] = per_tag.values[TAG];
10191	4		dynamic.values[PREVIOUS_OR_CURRENT_VERB_FORM] = per_form.values[FORM];
10192			} else {
10193	11		dynamic.values[PREVIOUS_OR_CURRENT_VERB_TAG] = dynamic.values[PREVIOUS_VERB_TAG];
10194	11		dynamic.values[PREVIOUS_OR_CURRENT_VERB_FORM] = dynamic.values[PREVIOUS_VERB_FORM];
10195			}
10196			}
10197
10198			} // namespace morphodita
10199
10200			/////////
10201			// File: morphodita/tagger/czech_elementary_features.h
10202			/////////
10203
10204			// This file is part of MorphoDiTa .
10205			//
10206			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
10207			// Mathematics and Physics, Charles University in Prague, Czech Republic.
10208			//
10209			// This Source Code Form is subject to the terms of the Mozilla Public
10210			// License, v. 2.0. If a copy of the MPL was not distributed with this
10211			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
10212
10213			namespace morphodita {
10214
10215			// Declarations
10216			template
10217	0		class czech_elementary_features : public elementary_features {
10218			public:
10219			czech_elementary_features();
10220
10221			enum features_per_form { FORM, FOLLOWING_VERB_TAG, FOLLOWING_VERB_LEMMA, NUM, CAP, DASH, PREFIX1, PREFIX2, PREFIX3, PREFIX4, SUFFIX1, SUFFIX2, SUFFIX3, SUFFIX4, PER_FORM_TOTAL };
10222			enum features_per_tag { TAG, TAG3, TAG5, TAG25, LEMMA, PER_TAG_TOTAL };
10223			enum features_dynamic { PREVIOUS_VERB_TAG, PREVIOUS_VERB_LEMMA, PREVIOUS_OR_CURRENT_VERB_TAG, PREVIOUS_OR_CURRENT_VERB_LEMMA, DYNAMIC_TOTAL };
10224			enum features_map { MAP_NONE = -1, MAP_FORM, MAP_LEMMA, MAP_PREFIX1, MAP_PREFIX2, MAP_PREFIX3, MAP_PREFIX4, MAP_SUFFIX1, MAP_SUFFIX2, MAP_SUFFIX3, MAP_SUFFIX4, MAP_TAG, MAP_TAG3, MAP_TAG5, MAP_TAG25, MAP_TOTAL } ;
10225
10226			struct per_form_features { elementary_feature_value values[PER_FORM_TOTAL]; };
10227			struct per_tag_features { elementary_feature_value values[PER_TAG_TOTAL]; };
10228			struct dynamic_features { elementary_feature_value values[DYNAMIC_TOTAL]; };
10229
10230			static vector descriptions;
10231
10232			void compute_features(const vector& forms, const vector>& analyses, vector& per_form, vector>& per_tag) const;
10233			inline void compute_dynamic_features(const tagged_lemma& tag, const per_form_features& per_form, const per_tag_features& per_tag, const dynamic_features* prev_dynamic, dynamic_features& dynamic) const;
10234
10235			using elementary_features::maps;
10236			};
10237
10238			typedef czech_elementary_features persistent_czech_elementary_features;
10239
10240			// Definitions
10241			template
10242	0		czech_elementary_features::czech_elementary_features() {
10243	0	0	maps.resize(MAP_TOTAL);
10244	0		}
10245
10246			template
10247			vector czech_elementary_features::descriptions = {
10248			{"Form", PER_FORM, ANY_OFFSET, FORM, MAP_FORM},
10249			{"FollowingVerbTag", PER_FORM, ANY_OFFSET, FOLLOWING_VERB_TAG, MAP_TAG},
10250			{"FollowingVerbLemma", PER_FORM, ANY_OFFSET, FOLLOWING_VERB_LEMMA, MAP_LEMMA },
10251			{"Num", PER_FORM, ONLY_CURRENT, NUM, MAP_NONE},
10252			{"Cap", PER_FORM, ONLY_CURRENT, CAP, MAP_NONE},
10253			{"Dash", PER_FORM, ONLY_CURRENT, DASH, MAP_NONE},
10254			{"Prefix1", PER_FORM, ONLY_CURRENT, PREFIX1, MAP_PREFIX1},
10255			{"Prefix2", PER_FORM, ONLY_CURRENT, PREFIX2, MAP_PREFIX2},
10256			{"Prefix3", PER_FORM, ONLY_CURRENT, PREFIX3, MAP_PREFIX3},
10257			{"Prefix4", PER_FORM, ONLY_CURRENT, PREFIX4, MAP_PREFIX4},
10258			{"Suffix1", PER_FORM, ONLY_CURRENT, SUFFIX1, MAP_SUFFIX1},
10259			{"Suffix2", PER_FORM, ONLY_CURRENT, SUFFIX2, MAP_SUFFIX2},
10260			{"Suffix3", PER_FORM, ONLY_CURRENT, SUFFIX3, MAP_SUFFIX3},
10261			{"Suffix4", PER_FORM, ONLY_CURRENT, SUFFIX4, MAP_SUFFIX4},
10262
10263			{"Tag", PER_TAG, ANY_OFFSET, TAG, MAP_TAG},
10264			{"Tag3", PER_TAG, ANY_OFFSET, TAG3, MAP_TAG3},
10265			{"Tag5", PER_TAG, ANY_OFFSET, TAG5, MAP_TAG5},
10266			{"Tag25", PER_TAG, ANY_OFFSET, TAG25, MAP_TAG25},
10267			{"Lemma", PER_TAG, ANY_OFFSET, LEMMA, MAP_LEMMA},
10268
10269			{"PreviousVerbTag", DYNAMIC, ANY_OFFSET, PREVIOUS_VERB_TAG, MAP_TAG},
10270			{"PreviousVerbLemma", DYNAMIC, ANY_OFFSET, PREVIOUS_VERB_LEMMA, MAP_LEMMA}
10271			};
10272
10273			template
10274	0		void czech_elementary_features::compute_features(const vector& forms, const vector>& analyses, vector& per_form, vector>& per_tag) const {
10275			using namespace unilib;
10276
10277			// We process the sentence in reverse order, so that we can compute FollowingVerbTag and FollowingVerbLemma directly.
10278			elementary_feature_value following_verb_tag = elementary_feature_empty, following_verb_lemma = elementary_feature_empty;
10279	0	0	for (unsigned i = forms.size(); i--;) {
10280			int verb_candidate = -1;
10281
10282			// Per_tag features and verb_candidate
10283	0	0	for (unsigned j = 0; j < analyses[i].size(); j++) {
10284			char tag25[2];
10285	0		per_tag[i][j].values[TAG] = maps[MAP_TAG].value(analyses[i][j].tag.c_str(), analyses[i][j].tag.size());
10286	0	0	per_tag[i][j].values[TAG3] = analyses[i][j].tag.size() >= 3 ? maps[MAP_TAG3].value(analyses[i][j].tag.c_str() + 2, 1) : elementary_feature_empty;
10287	0	0	per_tag[i][j].values[TAG5] = analyses[i][j].tag.size() >= 5 ? maps[MAP_TAG5].value(analyses[i][j].tag.c_str() + 4, 1) : elementary_feature_empty;
10288	0	0	per_tag[i][j].values[TAG25] = analyses[i][j].tag.size() >= 5 ? maps[MAP_TAG25].value((tag25[0] = analyses[i][j].tag[1], tag25[1] = analyses[i][j].tag[4], tag25), 2) : elementary_feature_empty;
10289	0	0	per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == analyses[i][j].lemma ? per_tag[i][j-1].values[LEMMA] :
		0
10290			maps[MAP_LEMMA].value(analyses[i][j].lemma.c_str(), analyses[i][j].lemma.size());
10291
10292	0	0	if (analyses[i][j].tag[0] == 'V') {
10293			int tag_compare;
10294	0	0	verb_candidate = verb_candidate < 0 \|\| (tag_compare = analyses[i][j].tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) \|\| (tag_compare == 0 && analyses[i][j].lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate;
		0
10295			}
10296			}
10297
10298			// Per_form features
10299	0		per_form[i].values[FORM] = maps[MAP_FORM].value(forms[i].str, forms[i].len);
10300	0		per_form[i].values[FOLLOWING_VERB_TAG] = following_verb_tag;
10301	0		per_form[i].values[FOLLOWING_VERB_LEMMA] = following_verb_lemma;
10302
10303			// Update following_verb_{tag,lemma} _after_ filling FOLLOWING_VERB_{TAG,LEMMA}.
10304	0	0	if (verb_candidate >= 0) {
10305	0		following_verb_tag = per_tag[i][verb_candidate].values[TAG];
10306	0		following_verb_lemma = per_tag[i][verb_candidate].values[LEMMA];
10307			}
10308
10309			// Ortographic per_form features if needed
10310	0	0	if (analyses[i].size() == 1) {
10311	0		per_form[i].values[NUM] = per_form[i].values[CAP] = per_form[i].values[DASH] = elementary_feature_unknown;
10312	0		per_form[i].values[PREFIX1] = per_form[i].values[PREFIX2] = per_form[i].values[PREFIX3] = per_form[i].values[PREFIX4] = elementary_feature_unknown;
10313	0		per_form[i].values[SUFFIX1] = per_form[i].values[SUFFIX2] = per_form[i].values[SUFFIX3] = per_form[i].values[SUFFIX4] = elementary_feature_unknown;
10314	0	0	} else if (forms[i].len <= 0) {
10315	0		per_form[i].values[NUM] = per_form[i].values[CAP] = per_form[i].values[DASH] = elementary_feature_empty + 1;
10316	0		per_form[i].values[PREFIX1] = per_form[i].values[PREFIX2] = per_form[i].values[PREFIX3] = per_form[i].values[PREFIX4] = elementary_feature_empty;
10317	0		per_form[i].values[SUFFIX1] = per_form[i].values[SUFFIX2] = per_form[i].values[SUFFIX3] = per_form[i].values[SUFFIX4] = elementary_feature_empty;
10318			} else {
10319	0		string_piece form = forms[i];
10320	0		const char* form_start = form.str;
10321
10322			bool num = false, cap = false, dash = false;
10323	0		size_t indices[8] = {0, form.len, form.len, form.len, form.len, 0, 0, 0}; // careful here regarding forms shorter than 4 characters
10324			int index = 0;
10325	0	0	while (form.len) {
10326	0		indices[(index++)&7] = form.str - form_start;
10327
10328	0		unicode::category_t cat = unicode::category(utf8::decode(form.str, form.len));
10329	0	0	num = num \|\| cat & unicode::N;
		0
10330	0	0	cap = cap \|\| cat & unicode::Lut;
		0
10331	0	0	dash = dash \|\| cat & unicode::Pd;
		0
10332
10333	0	0	if (index == 5 \|\| (!form.len && index < 5)) {
		0
		0
10334	0		per_form[i].values[PREFIX1] = maps[MAP_PREFIX1].value(form_start, indices[1]);
10335	0		per_form[i].values[PREFIX2] = maps[MAP_PREFIX2].value(form_start, indices[2]);
10336	0		per_form[i].values[PREFIX3] = maps[MAP_PREFIX3].value(form_start, indices[3]);
10337	0		per_form[i].values[PREFIX4] = maps[MAP_PREFIX4].value(form_start, indices[4]);
10338			}
10339			}
10340	0		per_form[i].values[SUFFIX1] = maps[MAP_SUFFIX1].value(form_start + indices[(index-1)&7], form.str - form_start - indices[(index-1)&7]);
10341	0		per_form[i].values[SUFFIX2] = maps[MAP_SUFFIX2].value(form_start + indices[(index-2)&7], form.str - form_start - indices[(index-2)&7]);
10342	0		per_form[i].values[SUFFIX3] = maps[MAP_SUFFIX3].value(form_start + indices[(index-3)&7], form.str - form_start - indices[(index-3)&7]);
10343	0		per_form[i].values[SUFFIX4] = maps[MAP_SUFFIX4].value(form_start + indices[(index-4)&7], form.str - form_start - indices[(index-4)&7]);
10344	0		per_form[i].values[NUM] = elementary_feature_empty + 1 + num;
10345	0		per_form[i].values[CAP] = elementary_feature_empty + 1 + cap;
10346	0		per_form[i].values[DASH] = elementary_feature_empty + 1 + dash;
10347			}
10348			}
10349	0		}
10350
10351			template
10352			void czech_elementary_features::compute_dynamic_features(const tagged_lemma& tag, const per_form_features& /per_form/, const per_tag_features& per_tag, const dynamic_features* prev_dynamic, dynamic_features& dynamic) const {
10353	0	0	if (prev_dynamic) {
10354	0		dynamic.values[PREVIOUS_VERB_TAG] = prev_dynamic->values[PREVIOUS_OR_CURRENT_VERB_TAG];
10355	0		dynamic.values[PREVIOUS_VERB_LEMMA] = prev_dynamic->values[PREVIOUS_OR_CURRENT_VERB_LEMMA];
10356			} else {
10357	0		dynamic.values[PREVIOUS_VERB_TAG] = elementary_feature_empty;
10358	0		dynamic.values[PREVIOUS_VERB_LEMMA] = elementary_feature_empty;
10359			}
10360
10361	0	0	if (tag.tag[0] == 'V') {
10362	0		dynamic.values[PREVIOUS_OR_CURRENT_VERB_TAG] = per_tag.values[TAG];
10363	0		dynamic.values[PREVIOUS_OR_CURRENT_VERB_LEMMA] = per_tag.values[LEMMA];
10364			} else {
10365	0		dynamic.values[PREVIOUS_OR_CURRENT_VERB_TAG] = dynamic.values[PREVIOUS_VERB_TAG];
10366	0		dynamic.values[PREVIOUS_OR_CURRENT_VERB_LEMMA] = dynamic.values[PREVIOUS_VERB_LEMMA];
10367			}
10368			}
10369
10370			} // namespace morphodita
10371
10372			/////////
10373			// File: morphodita/tagger/generic_elementary_features.h
10374			/////////
10375
10376			// This file is part of MorphoDiTa .
10377			//
10378			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
10379			// Mathematics and Physics, Charles University in Prague, Czech Republic.
10380			//
10381			// This Source Code Form is subject to the terms of the Mozilla Public
10382			// License, v. 2.0. If a copy of the MPL was not distributed with this
10383			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
10384
10385			namespace morphodita {
10386
10387			// Declarations
10388			template
10389	0		class generic_elementary_features : public elementary_features {
10390			public:
10391			generic_elementary_features();
10392
10393			enum features_per_form { FORM, FOLLOWING_VERB_TAG, FOLLOWING_VERB_LEMMA, NUM, CAP, DASH, PREFIX1, PREFIX2, PREFIX3, PREFIX4, PREFIX5, PREFIX6, PREFIX7, PREFIX8, PREFIX9, SUFFIX1, SUFFIX2, SUFFIX3, SUFFIX4, SUFFIX5, SUFFIX6, SUFFIX7, SUFFIX8, SUFFIX9, PER_FORM_TOTAL };
10394			enum features_per_tag { TAG, TAG1, TAG2, TAG3, TAG4, TAG5, LEMMA, PER_TAG_TOTAL };
10395			enum features_dynamic { PREVIOUS_VERB_TAG, PREVIOUS_VERB_LEMMA, PREVIOUS_OR_CURRENT_VERB_TAG, PREVIOUS_OR_CURRENT_VERB_LEMMA, DYNAMIC_TOTAL };
10396			enum features_map { MAP_NONE = -1, MAP_FORM, MAP_PREFIX1, MAP_PREFIX2, MAP_PREFIX3, MAP_PREFIX4, MAP_PREFIX5, MAP_PREFIX6, MAP_PREFIX7, MAP_PREFIX8, MAP_PREFIX9, MAP_SUFFIX1, MAP_SUFFIX2, MAP_SUFFIX3, MAP_SUFFIX4, MAP_SUFFIX5, MAP_SUFFIX6, MAP_SUFFIX7, MAP_SUFFIX8, MAP_SUFFIX9, MAP_TAG, MAP_TAG1, MAP_TAG2, MAP_TAG3, MAP_TAG4, MAP_TAG5, MAP_LEMMA, MAP_TOTAL } ;
10397
10398			struct per_form_features { elementary_feature_value values[PER_FORM_TOTAL]; };
10399			struct per_tag_features { elementary_feature_value values[PER_TAG_TOTAL]; };
10400			struct dynamic_features { elementary_feature_value values[DYNAMIC_TOTAL]; };
10401
10402			static vector descriptions;
10403
10404			void compute_features(const vector& forms, const vector>& analyses, vector& per_form, vector>& per_tag) const;
10405			inline void compute_dynamic_features(const tagged_lemma& tag, const per_form_features& per_form, const per_tag_features& per_tag, const dynamic_features* prev_dynamic, dynamic_features& dynamic) const;
10406
10407			using elementary_features::maps;
10408			};
10409
10410			typedef generic_elementary_features persistent_generic_elementary_features;
10411
10412			// Definitions
10413			template
10414	0		generic_elementary_features::generic_elementary_features() {
10415	0	0	maps.resize(MAP_TOTAL);
10416	0		}
10417
10418			template
10419			vector generic_elementary_features::descriptions = {
10420			{"Form", PER_FORM, ANY_OFFSET, FORM, MAP_FORM},
10421			{"FollowingVerbTag", PER_FORM, ANY_OFFSET, FOLLOWING_VERB_TAG, MAP_TAG},
10422			{"FollowingVerbLemma", PER_FORM, ANY_OFFSET, FOLLOWING_VERB_LEMMA, MAP_LEMMA },
10423			{"Num", PER_FORM, ONLY_CURRENT, NUM, MAP_NONE},
10424			{"Cap", PER_FORM, ONLY_CURRENT, CAP, MAP_NONE},
10425			{"Dash", PER_FORM, ONLY_CURRENT, DASH, MAP_NONE},
10426			{"Prefix1", PER_FORM, ONLY_CURRENT, PREFIX1, MAP_PREFIX1},
10427			{"Prefix2", PER_FORM, ONLY_CURRENT, PREFIX2, MAP_PREFIX2},
10428			{"Prefix3", PER_FORM, ONLY_CURRENT, PREFIX3, MAP_PREFIX3},
10429			{"Prefix4", PER_FORM, ONLY_CURRENT, PREFIX4, MAP_PREFIX4},
10430			{"Prefix5", PER_FORM, ONLY_CURRENT, PREFIX5, MAP_PREFIX5},
10431			{"Prefix6", PER_FORM, ONLY_CURRENT, PREFIX6, MAP_PREFIX6},
10432			{"Prefix7", PER_FORM, ONLY_CURRENT, PREFIX7, MAP_PREFIX7},
10433			{"Prefix8", PER_FORM, ONLY_CURRENT, PREFIX8, MAP_PREFIX8},
10434			{"Prefix9", PER_FORM, ONLY_CURRENT, PREFIX9, MAP_PREFIX9},
10435			{"Suffix1", PER_FORM, ONLY_CURRENT, SUFFIX1, MAP_SUFFIX1},
10436			{"Suffix2", PER_FORM, ONLY_CURRENT, SUFFIX2, MAP_SUFFIX2},
10437			{"Suffix3", PER_FORM, ONLY_CURRENT, SUFFIX3, MAP_SUFFIX3},
10438			{"Suffix4", PER_FORM, ONLY_CURRENT, SUFFIX4, MAP_SUFFIX4},
10439			{"Suffix5", PER_FORM, ONLY_CURRENT, SUFFIX5, MAP_SUFFIX5},
10440			{"Suffix6", PER_FORM, ONLY_CURRENT, SUFFIX6, MAP_SUFFIX6},
10441			{"Suffix7", PER_FORM, ONLY_CURRENT, SUFFIX7, MAP_SUFFIX7},
10442			{"Suffix8", PER_FORM, ONLY_CURRENT, SUFFIX8, MAP_SUFFIX8},
10443			{"Suffix9", PER_FORM, ONLY_CURRENT, SUFFIX9, MAP_SUFFIX9},
10444
10445			{"Tag", PER_TAG, ANY_OFFSET, TAG, MAP_TAG},
10446			{"Tag1", PER_TAG, ANY_OFFSET, TAG1, MAP_TAG1},
10447			{"Tag2", PER_TAG, ANY_OFFSET, TAG2, MAP_TAG2},
10448			{"Tag3", PER_TAG, ANY_OFFSET, TAG3, MAP_TAG3},
10449			{"Tag4", PER_TAG, ANY_OFFSET, TAG4, MAP_TAG4},
10450			{"Tag5", PER_TAG, ANY_OFFSET, TAG5, MAP_TAG5},
10451			{"Lemma", PER_TAG, ANY_OFFSET, LEMMA, MAP_LEMMA},
10452
10453			{"PreviousVerbTag", DYNAMIC, ANY_OFFSET, PREVIOUS_VERB_TAG, MAP_TAG},
10454			{"PreviousVerbLemma", DYNAMIC, ANY_OFFSET, PREVIOUS_VERB_LEMMA, MAP_LEMMA}
10455			};
10456
10457			template
10458	0		void generic_elementary_features::compute_features(const vector& forms, const vector>& analyses, vector& per_form, vector>& per_tag) const {
10459			using namespace unilib;
10460
10461			// We process the sentence in reverse order, so that we can compute FollowingVerbTag and FollowingVerbLemma directly.
10462			elementary_feature_value following_verb_tag = elementary_feature_empty, following_verb_lemma = elementary_feature_empty;
10463	0	0	for (unsigned i = forms.size(); i--;) {
10464			int verb_candidate = -1;
10465
10466			// Per_tag features and verb_candidate
10467	0	0	for (unsigned j = 0; j < analyses[i].size(); j++) {
10468	0		per_tag[i][j].values[TAG] = maps[MAP_TAG].value(analyses[i][j].tag.c_str(), analyses[i][j].tag.size());
10469	0	0	per_tag[i][j].values[TAG1] = analyses[i][j].tag.size() >= 1 ? maps[MAP_TAG1].value(analyses[i][j].tag.c_str() + 0, 1) : elementary_feature_empty;
10470	0	0	per_tag[i][j].values[TAG2] = analyses[i][j].tag.size() >= 2 ? maps[MAP_TAG2].value(analyses[i][j].tag.c_str() + 1, 1) : elementary_feature_empty;
10471	0	0	per_tag[i][j].values[TAG3] = analyses[i][j].tag.size() >= 3 ? maps[MAP_TAG3].value(analyses[i][j].tag.c_str() + 2, 1) : elementary_feature_empty;
10472	0	0	per_tag[i][j].values[TAG4] = analyses[i][j].tag.size() >= 4 ? maps[MAP_TAG4].value(analyses[i][j].tag.c_str() + 3, 1) : elementary_feature_empty;
10473	0	0	per_tag[i][j].values[TAG5] = analyses[i][j].tag.size() >= 5 ? maps[MAP_TAG5].value(analyses[i][j].tag.c_str() + 4, 1) : elementary_feature_empty;
10474	0	0	per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == analyses[i][j].lemma ? per_tag[i][j-1].values[LEMMA] :
		0
10475			maps[MAP_LEMMA].value(analyses[i][j].lemma.c_str(), analyses[i][j].lemma.size());
10476
10477	0	0	if (analyses[i][j].tag[0] == 'V') {
10478			int tag_compare;
10479	0	0	verb_candidate = verb_candidate < 0 \|\| (tag_compare = analyses[i][j].tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) \|\| (tag_compare == 0 && analyses[i][j].lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate;
		0
10480			}
10481			}
10482
10483			// Per_form features
10484	0		per_form[i].values[FORM] = maps[MAP_FORM].value(forms[i].str, forms[i].len);
10485	0		per_form[i].values[FOLLOWING_VERB_TAG] = following_verb_tag;
10486	0		per_form[i].values[FOLLOWING_VERB_LEMMA] = following_verb_lemma;
10487
10488			// Update following_verb_{tag,lemma} _after_ filling FOLLOWING_VERB_{TAG,LEMMA}.
10489	0	0	if (verb_candidate >= 0) {
10490	0		following_verb_tag = per_tag[i][verb_candidate].values[TAG];
10491	0		following_verb_lemma = per_tag[i][verb_candidate].values[LEMMA];
10492			}
10493
10494			// Ortographic per_form features if needed
10495	0	0	if (analyses[i].size() == 1) {
10496	0		per_form[i].values[NUM] = per_form[i].values[CAP] = per_form[i].values[DASH] = elementary_feature_unknown;
10497	0		per_form[i].values[PREFIX1] = per_form[i].values[PREFIX2] = per_form[i].values[PREFIX3] = elementary_feature_unknown;
10498	0		per_form[i].values[PREFIX4] = per_form[i].values[PREFIX5] = per_form[i].values[PREFIX6] = elementary_feature_unknown;
10499	0		per_form[i].values[PREFIX7] = per_form[i].values[PREFIX8] = per_form[i].values[PREFIX9] = elementary_feature_unknown;
10500	0		per_form[i].values[SUFFIX1] = per_form[i].values[SUFFIX2] = per_form[i].values[SUFFIX3] = elementary_feature_unknown;
10501	0		per_form[i].values[SUFFIX4] = per_form[i].values[SUFFIX5] = per_form[i].values[SUFFIX6] = elementary_feature_unknown;
10502	0		per_form[i].values[SUFFIX7] = per_form[i].values[SUFFIX8] = per_form[i].values[SUFFIX9] = elementary_feature_unknown;
10503	0	0	} else if (forms[i].len <= 0) {
10504	0		per_form[i].values[NUM] = per_form[i].values[CAP] = per_form[i].values[DASH] = elementary_feature_empty + 1;
10505	0		per_form[i].values[PREFIX1] = per_form[i].values[PREFIX2] = per_form[i].values[PREFIX3] = elementary_feature_empty;
10506	0		per_form[i].values[PREFIX4] = per_form[i].values[PREFIX5] = per_form[i].values[PREFIX6] = elementary_feature_empty;
10507	0		per_form[i].values[PREFIX7] = per_form[i].values[PREFIX8] = per_form[i].values[PREFIX9] = elementary_feature_empty;
10508	0		per_form[i].values[SUFFIX1] = per_form[i].values[SUFFIX2] = per_form[i].values[SUFFIX3] = elementary_feature_empty;
10509	0		per_form[i].values[SUFFIX4] = per_form[i].values[SUFFIX5] = per_form[i].values[SUFFIX6] = elementary_feature_empty;
10510	0		per_form[i].values[SUFFIX7] = per_form[i].values[SUFFIX8] = per_form[i].values[SUFFIX9] = elementary_feature_empty;
10511			} else {
10512	0		string_piece form = forms[i];
10513	0		const char* form_start = form.str;
10514
10515			bool num = false, cap = false, dash = false;
10516	0		size_t indices[18] = {0, form.len, form.len, form.len, form.len, form.len, form.len, form.len, form.len, form.len, 0, 0, 0, 0, 0, 0, 0, 0}; // careful here regarding forms shorter than 9 characters
10517			int index = 0;
10518	0	0	while (form.len) {
10519	0		indices[(index++) % 18] = form.str - form_start;
10520
10521	0		unicode::category_t cat = unicode::category(utf8::decode(form.str, form.len));
10522	0	0	num = num \|\| cat & unicode::N;
		0
10523	0	0	cap = cap \|\| cat & unicode::Lut;
		0
10524	0	0	dash = dash \|\| cat & unicode::Pd;
		0
10525
10526	0	0	if (index == 10 \|\| (!form.len && index < 10)) {
		0
		0
10527	0		per_form[i].values[PREFIX1] = maps[MAP_PREFIX1].value(form_start, indices[1]);
10528	0		per_form[i].values[PREFIX2] = maps[MAP_PREFIX2].value(form_start, indices[2]);
10529	0		per_form[i].values[PREFIX3] = maps[MAP_PREFIX3].value(form_start, indices[3]);
10530	0		per_form[i].values[PREFIX4] = maps[MAP_PREFIX4].value(form_start, indices[4]);
10531	0		per_form[i].values[PREFIX5] = maps[MAP_PREFIX5].value(form_start, indices[5]);
10532	0		per_form[i].values[PREFIX6] = maps[MAP_PREFIX6].value(form_start, indices[6]);
10533	0		per_form[i].values[PREFIX7] = maps[MAP_PREFIX7].value(form_start, indices[7]);
10534	0		per_form[i].values[PREFIX8] = maps[MAP_PREFIX8].value(form_start, indices[8]);
10535	0		per_form[i].values[PREFIX9] = maps[MAP_PREFIX9].value(form_start, indices[9]);
10536			}
10537			}
10538	0		per_form[i].values[SUFFIX1] = maps[MAP_SUFFIX1].value(form_start + indices[(index+18-1) % 18], form.str - form_start - indices[(index+18-1) % 18]);
10539	0		per_form[i].values[SUFFIX2] = maps[MAP_SUFFIX2].value(form_start + indices[(index+18-2) % 18], form.str - form_start - indices[(index+18-2) % 18]);
10540	0		per_form[i].values[SUFFIX3] = maps[MAP_SUFFIX3].value(form_start + indices[(index+18-3) % 18], form.str - form_start - indices[(index+18-3) % 18]);
10541	0		per_form[i].values[SUFFIX4] = maps[MAP_SUFFIX4].value(form_start + indices[(index+18-4) % 18], form.str - form_start - indices[(index+18-4) % 18]);
10542	0		per_form[i].values[SUFFIX5] = maps[MAP_SUFFIX5].value(form_start + indices[(index+18-5) % 18], form.str - form_start - indices[(index+18-5) % 18]);
10543	0		per_form[i].values[SUFFIX6] = maps[MAP_SUFFIX6].value(form_start + indices[(index+18-6) % 18], form.str - form_start - indices[(index+18-6) % 18]);
10544	0		per_form[i].values[SUFFIX7] = maps[MAP_SUFFIX7].value(form_start + indices[(index+18-7) % 18], form.str - form_start - indices[(index+18-7) % 18]);
10545	0		per_form[i].values[SUFFIX8] = maps[MAP_SUFFIX8].value(form_start + indices[(index+18-8) % 18], form.str - form_start - indices[(index+18-8) % 18]);
10546	0		per_form[i].values[SUFFIX9] = maps[MAP_SUFFIX9].value(form_start + indices[(index+18-9) % 18], form.str - form_start - indices[(index+18-9) % 18]);
10547	0		per_form[i].values[NUM] = elementary_feature_empty + 1 + num;
10548	0		per_form[i].values[CAP] = elementary_feature_empty + 1 + cap;
10549	0		per_form[i].values[DASH] = elementary_feature_empty + 1 + dash;
10550			}
10551			}
10552	0		}
10553
10554			template
10555			void generic_elementary_features::compute_dynamic_features(const tagged_lemma& tag, const per_form_features& /per_form/, const per_tag_features& per_tag, const dynamic_features* prev_dynamic, dynamic_features& dynamic) const {
10556	0	0	if (prev_dynamic) {
10557	0		dynamic.values[PREVIOUS_VERB_TAG] = prev_dynamic->values[PREVIOUS_OR_CURRENT_VERB_TAG];
10558	0		dynamic.values[PREVIOUS_VERB_LEMMA] = prev_dynamic->values[PREVIOUS_OR_CURRENT_VERB_LEMMA];
10559			} else {
10560	0		dynamic.values[PREVIOUS_VERB_TAG] = elementary_feature_empty;
10561	0		dynamic.values[PREVIOUS_VERB_LEMMA] = elementary_feature_empty;
10562			}
10563
10564	0	0	if (tag.tag[0] == 'V') {
10565	0		dynamic.values[PREVIOUS_OR_CURRENT_VERB_TAG] = per_tag.values[TAG];
10566	0		dynamic.values[PREVIOUS_OR_CURRENT_VERB_LEMMA] = per_tag.values[LEMMA];
10567			} else {
10568	0		dynamic.values[PREVIOUS_OR_CURRENT_VERB_TAG] = dynamic.values[PREVIOUS_VERB_TAG];
10569	0		dynamic.values[PREVIOUS_OR_CURRENT_VERB_LEMMA] = dynamic.values[PREVIOUS_VERB_LEMMA];
10570			}
10571			}
10572
10573			} // namespace morphodita
10574
10575			/////////
10576			// File: morphodita/tagger/perceptron_tagger.h
10577			/////////
10578
10579			// This file is part of MorphoDiTa .
10580			//
10581			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
10582			// Mathematics and Physics, Charles University in Prague, Czech Republic.
10583			//
10584			// This Source Code Form is subject to the terms of the Mozilla Public
10585			// License, v. 2.0. If a copy of the MPL was not distributed with this
10586			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
10587
10588			namespace morphodita {
10589
10590			// Declarations
10591			template
10592	4		class perceptron_tagger : public tagger {
10593			public:
10594			perceptron_tagger(int decoding_order, int window_size);
10595
10596			bool load(istream& is);
10597			virtual const morpho* get_morpho() const override;
10598			virtual void tag(const vector& forms, vector& tags, morpho::guesser_mode guesser = morpho::guesser_mode(-1)) const override;
10599			virtual void tag_analyzed(const vector& forms, const vector>& analyses, vector& tags) const override;
10600
10601			private:
10602			int decoding_order, window_size;
10603
10604			unique_ptr dict;
10605			bool use_guesser;
10606			FeatureSequences features;
10607			typedef viterbi viterbi_decoder;
10608			viterbi_decoder decoder;
10609	3		struct cache {
10610			vector forms;
10611			vector> analyses;
10612			vector tags;
10613			typename viterbi_decoder::cache decoder_cache;
10614
10615	1	0	cache(const perceptron_tagger& self) : decoder_cache(self.decoder) {}
		0
		50
10616			};
10617
10618			mutable threadsafe_stack caches;
10619			};
10620
10621			// Definitions
10622
10623			template
10624	1		perceptron_tagger::perceptron_tagger(int decoding_order, int window_size)
10625	1		: decoding_order(decoding_order), window_size(window_size), decoder(features, decoding_order, window_size) {}
10626
10627			template
10628	1		bool perceptron_tagger::load(istream& is) {
10629	2	50	if (dict.reset(morpho::load(is)), !dict) return false;
		0
		0
10630	1		use_guesser = is.get();
10631	1	50	if (!features.load(is)) return false;
		0
		0
10632	1		return true;
10633			}
10634
10635			template
10636	1		const morpho* perceptron_tagger::get_morpho() const {
10637	1		return dict.get();
10638			}
10639
10640			template
10641	1		void perceptron_tagger::tag(const vector& forms, vector& tags, morpho::guesser_mode guesser) const {
10642			tags.clear();
10643	1	0	if (!dict) return;
		0
		50
10644
10645	1		cache* c = caches.pop();
10646	1	0	if (!c) c = new cache(*this);
		0
		0
		0
		50
		50
10647
10648	1		c->forms.resize(forms.size());
10649	1	0	if (c->analyses.size() < forms.size()) c->analyses.resize(forms.size());
		0
		50
10650	8	0	for (unsigned i = 0; i < forms.size(); i++) {
		0
		100
10651	7		c->forms[i] = forms[i];
10652	7		c->forms[i].len = dict->raw_form_len(forms[i]);
10653	7	0	dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]);
		0
		0
		0
		50
		50
10654			}
10655
10656	1	0	if (c->tags.size() < forms.size()) c->tags.resize(forms.size() * 2);
		0
		50
10657	1		decoder.tag(c->forms, c->analyses, c->decoder_cache, c->tags);
10658
10659	8	0	for (unsigned i = 0; i < forms.size(); i++)
		0
		100
10660	7		tags.emplace_back(c->analyses[i][c->tags[i]]);
10661
10662	1		caches.push(c);
10663			}
10664
10665			template
10666	0		void perceptron_tagger::tag_analyzed(const vector& forms, const vector>& analyses, vector& tags) const {
10667			tags.clear();
10668
10669	0		cache* c = caches.pop();
10670	0	0	if (!c) c = new cache(*this);
		0
		0
		0
		0
		0
10671
10672	0		tags.resize(forms.size());
10673	0		decoder.tag(forms, analyses, c->decoder_cache, tags);
10674
10675	0		caches.push(c);
10676	0		}
10677
10678			} // namespace morphodita
10679
10680			/////////
10681			// File: morphodita/tagger/tagger.cpp
10682			/////////
10683
10684			// This file is part of MorphoDiTa .
10685			//
10686			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
10687			// Mathematics and Physics, Charles University in Prague, Czech Republic.
10688			//
10689			// This Source Code Form is subject to the terms of the Mozilla Public
10690			// License, v. 2.0. If a copy of the MPL was not distributed with this
10691			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
10692
10693			namespace morphodita {
10694
10695	1		tagger* tagger::load(istream& is) {
10696	1	50	tagger_id id = tagger_id(is.get());
		50
		50
		50
		0
		0
		0
10697			switch (id) {
10698			case tagger_ids::CZECH2:
10699			case tagger_ids::CZECH2_3:
10700			case tagger_ids::CZECH3:
10701			{
10702	0	0	auto res = new_unique_ptr>>(tagger_ids::decoding_order(id), tagger_ids::window_size(id));
10703	0	0	if (res->load(is)) return res.release();
		0
10704			break;
10705			}
10706			case tagger_ids::GENERIC2:
10707			case tagger_ids::GENERIC2_3:
10708			case tagger_ids::GENERIC3:
10709			case tagger_ids::GENERIC4:
10710			{
10711	0	0	auto res = new_unique_ptr>>(tagger_ids::decoding_order(id), tagger_ids::window_size(id));
10712	0	0	if (res->load(is)) return res.release();
		0
10713			break;
10714			}
10715			case tagger_ids::CONLLU2:
10716			case tagger_ids::CONLLU2_3:
10717			case tagger_ids::CONLLU3:
10718			{
10719	1	50	auto res = new_unique_ptr>>(tagger_ids::decoding_order(id), tagger_ids::window_size(id));
10720	1	50	if (res->load(is)) return res.release();
		50
10721			break;
10722			}
10723			}
10724
10725			return nullptr;
10726			}
10727
10728	0		tagger* tagger::load(const char* fname) {
10729	0	0	ifstream f(path_from_utf8(fname).c_str(), ifstream::binary);
10730	0	0	if (!f) return nullptr;
10731
10732	0	0	return load(f);
10733			}
10734
10735	0		tokenizer* tagger::new_tokenizer() const {
10736	0		auto morpho = get_morpho();
10737	0	0	return morpho ? morpho->new_tokenizer() : nullptr;
10738			}
10739
10740			} // namespace morphodita
10741
10742			/////////
10743			// File: morphodita/tagset_converter/identity_tagset_converter.h
10744			/////////
10745
10746			// This file is part of MorphoDiTa .
10747			//
10748			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
10749			// Mathematics and Physics, Charles University in Prague, Czech Republic.
10750			//
10751			// This Source Code Form is subject to the terms of the Mozilla Public
10752			// License, v. 2.0. If a copy of the MPL was not distributed with this
10753			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
10754
10755			namespace morphodita {
10756
10757	0		class identity_tagset_converter : public tagset_converter {
10758			public:
10759			virtual void convert(tagged_lemma& tagged_lemma) const override;
10760			virtual void convert_analyzed(vector& tagged_lemmas) const override;
10761			virtual void convert_generated(vector& forms) const override;
10762			};
10763
10764			} // namespace morphodita
10765
10766			/////////
10767			// File: morphodita/tagset_converter/identity_tagset_converter.cpp
10768			/////////
10769
10770			// This file is part of MorphoDiTa .
10771			//
10772			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
10773			// Mathematics and Physics, Charles University in Prague, Czech Republic.
10774			//
10775			// This Source Code Form is subject to the terms of the Mozilla Public
10776			// License, v. 2.0. If a copy of the MPL was not distributed with this
10777			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
10778
10779			namespace morphodita {
10780
10781	0		void identity_tagset_converter::convert(tagged_lemma& /tagged_lemma/) const {}
10782
10783	0		void identity_tagset_converter::convert_analyzed(vector& /tagged_lemmas/) const {}
10784
10785	0		void identity_tagset_converter::convert_generated(vector& /forms/) const {}
10786
10787			} // namespace morphodita
10788
10789			/////////
10790			// File: morphodita/tagset_converter/pdt_to_conll2009_tagset_converter.h
10791			/////////
10792
10793			// This file is part of MorphoDiTa .
10794			//
10795			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
10796			// Mathematics and Physics, Charles University in Prague, Czech Republic.
10797			//
10798			// This Source Code Form is subject to the terms of the Mozilla Public
10799			// License, v. 2.0. If a copy of the MPL was not distributed with this
10800			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
10801
10802			namespace morphodita {
10803
10804	0		class pdt_to_conll2009_tagset_converter : public tagset_converter {
10805			public:
10806			virtual void convert(tagged_lemma& tagged_lemma) const override;
10807			virtual void convert_analyzed(vector& tagged_lemmas) const override;
10808			virtual void convert_generated(vector& forms) const override;
10809
10810			private:
10811			inline void convert_tag(const string& lemma, string& tag) const;
10812			inline bool convert_lemma(string& lemma) const;
10813			};
10814
10815			} // namespace morphodita
10816
10817			/////////
10818			// File: morphodita/tagset_converter/pdt_to_conll2009_tagset_converter.cpp
10819			/////////
10820
10821			// This file is part of MorphoDiTa .
10822			//
10823			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
10824			// Mathematics and Physics, Charles University in Prague, Czech Republic.
10825			//
10826			// This Source Code Form is subject to the terms of the Mozilla Public
10827			// License, v. 2.0. If a copy of the MPL was not distributed with this
10828			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
10829
10830			namespace morphodita {
10831
10832			static const char* names[15] = {"POS", "SubPOS", "Gen", "Num", "Cas", "PGe", "PNu", "Per", "Ten", "Gra", "Neg", "Voi", "", "", "Var"};
10833
10834	0		inline void pdt_to_conll2009_tagset_converter::convert_tag(const string& lemma, string& tag) const {
10835			char pdt_tag[16];
10836			strncpy(pdt_tag, tag.c_str(), 15);
10837
10838			// Clear the tag
10839			tag.clear();
10840
10841			// Fill FEAT of filled tag characters
10842	0	0	for (int i = 0; i < 15 && pdt_tag[i]; i++)
		0
10843	0	0	if (pdt_tag[i] != '-') {
10844	0	0	if (!tag.empty()) tag.push_back('\|');
10845	0		tag.append(names[i]);
10846	0		tag.push_back('=');
10847	0		tag.push_back(pdt_tag[i]);
10848			}
10849
10850			// Try adding Sem FEAT
10851	0	0	for (unsigned i = 0; i + 2 < lemma.size(); i++)
10852	0	0	if (lemma[i] == '_' && lemma[i + 1] == ';') {
		0
		0
10853	0	0	if (!tag.empty()) tag.push_back('\|');
10854	0		tag.append("Sem=");
10855	0		tag.push_back(lemma[i + 2]);
10856			break;
10857			}
10858	0		}
10859
10860	0		inline bool pdt_to_conll2009_tagset_converter::convert_lemma(string& lemma) const {
10861	0		unsigned raw_lemma = czech_lemma_addinfo::raw_lemma_len(lemma);
10862	0	0	return raw_lemma < lemma.size() ? (lemma.resize(raw_lemma), true) : false;
10863			}
10864
10865	0		void pdt_to_conll2009_tagset_converter::convert(tagged_lemma& tagged_lemma) const {
10866	0		convert_tag(tagged_lemma.lemma, tagged_lemma.tag);
10867	0		convert_lemma(tagged_lemma.lemma);
10868	0		}
10869
10870	0		void pdt_to_conll2009_tagset_converter::convert_analyzed(vector& tagged_lemmas) const {
10871			bool lemma_changed = false;
10872
10873	0	0	for (auto&& tagged_lemma : tagged_lemmas) {
10874	0		convert_tag(tagged_lemma.lemma, tagged_lemma.tag);
10875	0		lemma_changed \|= convert_lemma(tagged_lemma.lemma);
10876			}
10877
10878			// If no lemma was changed or there is 1 analysis, no duplicates could be created.
10879	0	0	if (!lemma_changed \|\| tagged_lemmas.size() < 2) return;
		0
		0
10880
10881	0		tagset_converter_unique_analyzed(tagged_lemmas);
10882			}
10883
10884	0		void pdt_to_conll2009_tagset_converter::convert_generated(vector& forms) const {
10885			bool lemma_changed = false;
10886
10887	0	0	for (auto&& tagged_lemma_forms : forms) {
10888	0	0	for (auto&& tagged_form : tagged_lemma_forms.forms)
10889	0		convert_tag(tagged_lemma_forms.lemma, tagged_form.tag);
10890	0		lemma_changed \|= convert_lemma(tagged_lemma_forms.lemma);
10891			}
10892
10893			// If no lemma was changed or there is 1 analysis, no duplicates could be created.
10894	0	0	if (!lemma_changed \|\| forms.size() < 2) return;
		0
		0
10895
10896	0		tagset_converter_unique_generated(forms);
10897			}
10898
10899			} // namespace morphodita
10900
10901			/////////
10902			// File: morphodita/tagset_converter/strip_lemma_comment_tagset_converter.h
10903			/////////
10904
10905			// This file is part of MorphoDiTa .
10906			//
10907			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
10908			// Mathematics and Physics, Charles University in Prague, Czech Republic.
10909			//
10910			// This Source Code Form is subject to the terms of the Mozilla Public
10911			// License, v. 2.0. If a copy of the MPL was not distributed with this
10912			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
10913
10914			namespace morphodita {
10915
10916	0		class strip_lemma_comment_tagset_converter : public tagset_converter {
10917			public:
10918	0		strip_lemma_comment_tagset_converter(const morpho& dictionary) : dictionary(dictionary) {}
10919
10920			virtual void convert(tagged_lemma& tagged_lemma) const override;
10921			virtual void convert_analyzed(vector& tagged_lemmas) const override;
10922			virtual void convert_generated(vector& forms) const override;
10923
10924			private:
10925			inline bool convert_lemma(string& lemma) const;
10926			const morpho& dictionary;
10927			};
10928
10929			} // namespace morphodita
10930
10931			/////////
10932			// File: morphodita/tagset_converter/strip_lemma_comment_tagset_converter.cpp
10933			/////////
10934
10935			// This file is part of MorphoDiTa .
10936			//
10937			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
10938			// Mathematics and Physics, Charles University in Prague, Czech Republic.
10939			//
10940			// This Source Code Form is subject to the terms of the Mozilla Public
10941			// License, v. 2.0. If a copy of the MPL was not distributed with this
10942			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
10943
10944			namespace morphodita {
10945
10946	0		inline bool strip_lemma_comment_tagset_converter::convert_lemma(string& lemma) const {
10947	0		unsigned lemma_id_len = dictionary.lemma_id_len(lemma);
10948	0	0	return lemma_id_len < lemma.size() ? (lemma.resize(lemma_id_len), true) : false;
10949			}
10950
10951	0		void strip_lemma_comment_tagset_converter::convert(tagged_lemma& tagged_lemma) const {
10952	0		convert_lemma(tagged_lemma.lemma);
10953	0		}
10954
10955	0		void strip_lemma_comment_tagset_converter::convert_analyzed(vector& tagged_lemmas) const {
10956			bool lemma_changed = false;
10957
10958	0	0	for (auto&& tagged_lemma : tagged_lemmas)
10959	0		lemma_changed \|= convert_lemma(tagged_lemma.lemma);
10960
10961			// If no lemma was changed or there is 1 analysis, no duplicates could be created.
10962	0	0	if (!lemma_changed \|\| tagged_lemmas.size() < 2) return;
		0
		0
10963
10964	0		tagset_converter_unique_analyzed(tagged_lemmas);
10965			}
10966
10967	0		void strip_lemma_comment_tagset_converter::convert_generated(vector& forms) const {
10968			bool lemma_changed = false;
10969
10970	0	0	for (auto&& tagged_lemma_forms : forms)
10971	0		lemma_changed \|= convert_lemma(tagged_lemma_forms.lemma);
10972
10973			// If no lemma was changed or there is 1 analysis, no duplicates could be created.
10974	0	0	if (!lemma_changed \|\| forms.size() < 2) return;
		0
		0
10975
10976	0		tagset_converter_unique_generated(forms);
10977			}
10978
10979			} // namespace morphodita
10980
10981			/////////
10982			// File: morphodita/tagset_converter/strip_lemma_id_tagset_converter.h
10983			/////////
10984
10985			// This file is part of MorphoDiTa .
10986			//
10987			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
10988			// Mathematics and Physics, Charles University in Prague, Czech Republic.
10989			//
10990			// This Source Code Form is subject to the terms of the Mozilla Public
10991			// License, v. 2.0. If a copy of the MPL was not distributed with this
10992			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
10993
10994			namespace morphodita {
10995
10996	0		class strip_lemma_id_tagset_converter : public tagset_converter {
10997			public:
10998	0		strip_lemma_id_tagset_converter(const morpho& dictionary) : dictionary(dictionary) {}
10999
11000			virtual void convert(tagged_lemma& tagged_lemma) const override;
11001			virtual void convert_analyzed(vector& tagged_lemmas) const override;
11002			virtual void convert_generated(vector& forms) const override;
11003
11004			private:
11005			inline bool convert_lemma(string& lemma) const;
11006			const morpho& dictionary;
11007			};
11008
11009			} // namespace morphodita
11010
11011			/////////
11012			// File: morphodita/tagset_converter/strip_lemma_id_tagset_converter.cpp
11013			/////////
11014
11015			// This file is part of MorphoDiTa .
11016			//
11017			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
11018			// Mathematics and Physics, Charles University in Prague, Czech Republic.
11019			//
11020			// This Source Code Form is subject to the terms of the Mozilla Public
11021			// License, v. 2.0. If a copy of the MPL was not distributed with this
11022			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
11023
11024			namespace morphodita {
11025
11026	0		inline bool strip_lemma_id_tagset_converter::convert_lemma(string& lemma) const {
11027	0		unsigned raw_lemma_len = dictionary.raw_lemma_len(lemma);
11028	0	0	return raw_lemma_len < lemma.size() ? (lemma.resize(raw_lemma_len), true) : false;
11029			}
11030
11031	0		void strip_lemma_id_tagset_converter::convert(tagged_lemma& tagged_lemma) const {
11032	0		convert_lemma(tagged_lemma.lemma);
11033	0		}
11034
11035	0		void strip_lemma_id_tagset_converter::convert_analyzed(vector& tagged_lemmas) const {
11036			bool lemma_changed = false;
11037
11038	0	0	for (auto&& tagged_lemma : tagged_lemmas)
11039	0		lemma_changed \|= convert_lemma(tagged_lemma.lemma);
11040
11041			// If no lemma was changed or there is 1 analysis, no duplicates could be created.
11042	0	0	if (!lemma_changed \|\| tagged_lemmas.size() < 2) return;
		0
		0
11043
11044	0		tagset_converter_unique_analyzed(tagged_lemmas);
11045			}
11046
11047	0		void strip_lemma_id_tagset_converter::convert_generated(vector& forms) const {
11048			bool lemma_changed = false;
11049
11050	0	0	for (auto&& tagged_lemma_forms : forms)
11051	0		lemma_changed \|= convert_lemma(tagged_lemma_forms.lemma);
11052
11053			// If no lemma was changed or there is 1 analysis, no duplicates could be created.
11054	0	0	if (!lemma_changed \|\| forms.size() < 2) return;
		0
		0
11055
11056	0		tagset_converter_unique_generated(forms);
11057			}
11058
11059			} // namespace morphodita
11060
11061			/////////
11062			// File: morphodita/tagset_converter/tagset_converter.cpp
11063			/////////
11064
11065			// This file is part of MorphoDiTa .
11066			//
11067			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
11068			// Mathematics and Physics, Charles University in Prague, Czech Republic.
11069			//
11070			// This Source Code Form is subject to the terms of the Mozilla Public
11071			// License, v. 2.0. If a copy of the MPL was not distributed with this
11072			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
11073
11074			namespace morphodita {
11075
11076	0		tagset_converter* tagset_converter::new_identity_converter() {
11077	0		return new identity_tagset_converter();
11078			}
11079
11080	0		tagset_converter* tagset_converter::new_pdt_to_conll2009_converter() {
11081	0		return new pdt_to_conll2009_tagset_converter();
11082			}
11083
11084	0		tagset_converter* tagset_converter::new_strip_lemma_comment_converter(const morpho& dictionary) {
11085	0		return new strip_lemma_comment_tagset_converter(dictionary);
11086			}
11087
11088	0		tagset_converter* tagset_converter::new_strip_lemma_id_converter(const morpho& dictionary) {
11089	0		return new strip_lemma_id_tagset_converter(dictionary);
11090			}
11091
11092	0		tagset_converter* new_tagset_converter(const string& name, const morpho& dictionary) {
11093	0	0	if (name == "pdt_to_conll2009") return tagset_converter::new_pdt_to_conll2009_converter();
11094	0	0	if (name == "strip_lemma_comment") return tagset_converter::new_strip_lemma_comment_converter(dictionary);
11095	0	0	if (name == "strip_lemma_id") return tagset_converter::new_strip_lemma_id_converter(dictionary);
11096			return nullptr;
11097			}
11098
11099	0		void tagset_converter_unique_analyzed(vector& tagged_lemmas) {
11100			// Remove possible lemma-tag pair duplicates
11101			struct tagged_lemma_comparator {
11102	0	0	inline static bool eq(const tagged_lemma& a, const tagged_lemma& b) { return a.lemma == b.lemma && a.tag == b.tag; }
		0
11103	0	0	inline static bool lt(const tagged_lemma& a, const tagged_lemma& b) { int lemma_compare = a.lemma.compare(b.lemma); return lemma_compare < 0 \|\| (lemma_compare == 0 && a.tag < b.tag); }
11104			};
11105
11106			sort(tagged_lemmas.begin(), tagged_lemmas.end(), tagged_lemma_comparator::lt);
11107	0		tagged_lemmas.resize(unique(tagged_lemmas.begin(), tagged_lemmas.end(), tagged_lemma_comparator::eq) - tagged_lemmas.begin());
11108	0		}
11109
11110	0		void tagset_converter_unique_generated(vector& forms) {
11111			// Regroup and if needed remove duplicate form-tag pairs for each lemma
11112	0	0	for (unsigned i = 0; i < forms.size(); i++) {
11113			bool any_merged = false;
11114	0	0	for (unsigned j = forms.size() - 1; j > i; j--)
11115	0	0	if (forms[j].lemma == forms[i].lemma) {
11116			// Same lemma was found. Merge form-tag pairs
11117	0	0	for (auto&& tagged_form : forms[j].forms)
11118	0		forms[i].forms.emplace_back(move(tagged_form));
11119
11120			// Remove lemma j by moving it to end and deleting
11121	0	0	if (j < forms.size() - 1) {
11122	0		forms[j].lemma.swap(forms[forms.size() - 1].lemma);
11123	0		forms[j].forms.swap(forms[forms.size() - 1].forms);
11124			}
11125			forms.pop_back();
11126			any_merged = true;
11127			}
11128
11129	0	0	if (any_merged && forms[i].forms.size() > 1) {
		0
		0
11130			// Remove duplicate form-tag pairs
11131			struct tagged_form_comparator {
11132	0	0	inline static bool eq(const tagged_form& a, const tagged_form& b) { return a.tag == b.tag && a.form == b.form; }
		0
11133	0	0	inline static bool lt(const tagged_form& a, const tagged_form& b) { int tag_compare = a.tag.compare(b.tag); return tag_compare < 0 \|\| (tag_compare == 0 && a.form < b.form); }
11134			};
11135
11136			sort(forms[i].forms.begin(), forms[i].forms.end(), tagged_form_comparator::lt);
11137	0		forms[i].forms.resize(unique(forms[i].forms.begin(), forms[i].forms.end(), tagged_form_comparator::eq) - forms[i].forms.begin());
11138			}
11139			}
11140	0		}
11141
11142			} // namespace morphodita
11143
11144			/////////
11145			// File: morphodita/tokenizer/czech_tokenizer.cpp
11146			/////////
11147
11148			// This file is part of MorphoDiTa .
11149			//
11150			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
11151			// Mathematics and Physics, Charles University in Prague, Czech Republic.
11152			//
11153			// This Source Code Form is subject to the terms of the Mozilla Public
11154			// License, v. 2.0. If a copy of the MPL was not distributed with this
11155			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
11156
11157			namespace morphodita {
11158
11159			static const char _czech_tokenizer_cond_offsets[] = {
11160			0, 0, 0, 0, 0, 0, 0, 0,
11161			2, 2, 2, 2, 2, 2, 2, 2,
11162			2, 2, 2, 2, 2, 2, 2
11163			};
11164
11165			static const char _czech_tokenizer_cond_lengths[] = {
11166			0, 0, 0, 0, 0, 0, 0, 2,
11167			0, 0, 0, 0, 0, 0, 0, 0,
11168			0, 0, 0, 0, 0, 0, 0
11169			};
11170
11171			static const short _czech_tokenizer_cond_keys[] = {
11172			43u, 43u, 45u, 45u, 0
11173			};
11174
11175			static const char _czech_tokenizer_cond_spaces[] = {
11176			1, 0, 0
11177			};
11178
11179			static const unsigned char _czech_tokenizer_key_offsets[] = {
11180			0, 0, 17, 29, 43, 46, 51, 54,
11181			89, 94, 98, 101, 105, 110, 111, 116,
11182			117, 122, 136, 143, 148, 151, 163
11183			};
11184
11185			static const short _czech_tokenizer_trans_keys[] = {
			// Transition keys of all states. For each state the single keys come
			// first, followed by the low/high range pairs; next_sentence
			// binary-searches each part separately.
11186			13u, 32u, 34u, 40u, 91u, 96u, 123u, 129u,
11187			133u, 135u, 147u, 150u, 162u, 9u, 10u, 65u,
11188			90u, 34u, 40u, 91u, 96u, 123u, 129u, 133u,
11189			135u, 150u, 162u, 65u, 90u, 13u, 32u, 34u,
11190			39u, 41u, 59u, 93u, 125u, 139u, 141u, 147u,
11191			161u, 9u, 10u, 159u, 48u, 57u, 43u, 45u,
11192			159u, 48u, 57u, 159u, 48u, 57u, 9u, 10u,
11193			13u, 32u, 33u, 44u, 46u, 47u, 63u, 129u,
11194			131u, 135u, 142u, 147u, 157u, 159u, 160u, 301u,
11195			557u, 811u, 1067u, 0u, 42u, 48u, 57u, 58u,
11196			64u, 65u, 90u, 91u, 96u, 97u, 122u, 123u,
11197			255u, 9u, 10u, 13u, 32u, 147u, 9u, 13u,
11198			32u, 147u, 9u, 32u, 147u, 9u, 10u, 32u,
11199			147u, 9u, 10u, 13u, 32u, 147u, 13u, 9u,
11200			10u, 13u, 32u, 147u, 10u, 9u, 10u, 13u,
11201			32u, 147u, 13u, 32u, 34u, 39u, 41u, 59u,
11202			93u, 125u, 139u, 141u, 147u, 161u, 9u, 10u,
11203			44u, 46u, 69u, 101u, 159u, 48u, 57u, 69u,
11204			101u, 159u, 48u, 57u, 159u, 48u, 57u, 129u,
11205			131u, 135u, 151u, 155u, 157u, 65u, 90u, 97u,
11206			122u, 142u, 143u, 159u, 48u, 57u, 0
11207			};
11208
11209			static const char _czech_tokenizer_single_lengths[] = {
			// Indexed by scanner state. Number of single (exact-match) keys the
			// state has in _czech_tokenizer_trans_keys.
11210			0, 13, 10, 12, 1, 3, 1, 21,
11211			5, 4, 3, 4, 5, 1, 5, 1,
11212			5, 12, 5, 3, 1, 6, 1
11213			};
11214
11215			static const char _czech_tokenizer_range_lengths[] = {
			// Indexed by scanner state. Number of low/high key ranges that follow
			// the state's single keys in _czech_tokenizer_trans_keys.
11216			0, 2, 1, 1, 1, 1, 1, 7,
11217			0, 0, 0, 0, 0, 0, 0, 0,
11218			0, 1, 1, 1, 1, 3, 1
11219			};
11220
11221			static const unsigned char _czech_tokenizer_index_offsets[] = {
			// Indexed by scanner state. Base position of the state's entries in
			// _czech_tokenizer_indicies.
11222			0, 0, 16, 28, 42, 45, 50, 53,
11223			82, 88, 93, 97, 102, 108, 110, 116,
11224			118, 124, 138, 145, 150, 153, 163
11225			};
11226
11227			static const char _czech_tokenizer_indicies[] = {
			// Maps a lookup position to a transition number. The position is the
			// state's index offset plus the index of the matched single key or
			// range; when nothing matches, it is the entry just past all of the
			// state's keys and ranges.
11228			1, 1, 2, 2, 2, 2, 2, 3,
11229			2, 3, 1, 2, 2, 1, 3, 0,
11230			2, 2, 2, 2, 2, 3, 2, 3,
11231			2, 2, 3, 0, 4, 4, 5, 5,
11232			5, 5, 5, 5, 5, 5, 4, 5,
11233			4, 0, 6, 6, 0, 7, 7, 8,
11234			8, 0, 8, 8, 0, 10, 11, 12,
11235			10, 13, 9, 13, 9, 13, 16, 16,
11236			16, 16, 10, 16, 15, 13, 9, 17,
11237			9, 17, 9, 15, 9, 16, 9, 16,
11238			9, 14, 10, 19, 20, 10, 10, 18,
11239			10, 21, 10, 10, 18, 10, 10, 10,
11240			18, 10, 21, 10, 10, 18, 10, 22,
11241			23, 10, 10, 18, 25, 24, 10, 22,
11242			26, 10, 10, 18, 25, 24, 10, 23,
11243			26, 10, 10, 18, 4, 4, 5, 5,
11244			5, 5, 5, 5, 5, 5, 4, 5,
11245			4, 27, 28, 28, 29, 29, 15, 15,
11246			27, 29, 29, 6, 6, 27, 8, 8,
11247			27, 16, 16, 16, 16, 16, 16, 16,
11248			16, 16, 27, 15, 15, 27, 0
11249			};
11250
11251			static const char _czech_tokenizer_trans_targs[] = {
			// Indexed by transition number. The state the scanner enters when the
			// transition is taken.
11252			7, 1, 2, 7, 1, 3, 19, 6,
11253			20, 7, 8, 12, 16, 17, 0, 18,
11254			21, 22, 7, 9, 11, 10, 13, 14,
11255			7, 7, 15, 7, 4, 5
11256			};
11257
11258			static const char _czech_tokenizer_trans_actions[] = {
			// Indexed by transition number. Id of the action that next_sentence
			// runs for the transition (its switch handles 1, 2, 3, 4, 7, 8, 9, 10
			// and 11); 0 means no action.
11259			1, 0, 0, 2, 3, 0, 4, 0,
11260			0, 7, 0, 0, 0, 4, 0, 4,
11261			0, 0, 8, 0, 0, 0, 0, 0,
11262			9, 10, 0, 11, 0, 0
11263			};
11264
11265			static const char _czech_tokenizer_to_state_actions[] = {
			// Indexed by scanner state. Action run after a transition into the
			// state; the only id handled by next_sentence is 5, which resets ts to 0.
11266			0, 0, 0, 0, 0, 0, 0, 5,
11267			0, 0, 0, 0, 0, 0, 0, 0,
11268			0, 0, 0, 0, 0, 0, 0
11269			};
11270
11271			static const char _czech_tokenizer_from_state_actions[] = {
			// Indexed by scanner state. Action run before a character is examined
			// in the state; the only id handled by next_sentence is 6, which sets
			// ts to the current position.
11272			0, 0, 0, 0, 0, 0, 0, 6,
11273			0, 0, 0, 0, 0, 0, 0, 0,
11274			0, 0, 0, 0, 0, 0, 0
11275			};
11276
11277			static const unsigned char _czech_tokenizer_eof_trans[] = {
			// Indexed by scanner state. One-based number of the transition taken
			// when the input ends in the state (next_sentence subtracts 1 before
			// use); 0 means no end-of-input transition.
11278			0, 1, 1, 1, 1, 1, 1, 0,
11279			19, 19, 19, 19, 19, 25, 19, 25,
11280			19, 28, 28, 28, 28, 28, 28
11281			};
11282
11283			static const int czech_tokenizer_start = 7; // State in which czech_tokenizer::next_sentence starts scanning.
11284
11285			// The list of lowercased words that, when preceding an end-of-sentence
			// character, do not end the sentence. The constructor selects this set
			// for the CZECH language.
11286			// Note: because of VS, we cannot list the abbreviations directly in UTF-8,
11287			// because the compilation of UTF-8 encoded sources fails on some locales
11288			// (e.g., Japanese). Non-ASCII characters are therefore written as octal
			// escapes of their UTF-8 bytes; the first Perl command below produces
			// that form and the second one decodes two-byte sequences back.
11289			// perl -CS -ple 'use Encode;s/([^[:ascii:]])/join("", map {sprintf "\\%o", ord($_)} split(m@@, encode("utf-8", $1)))/ge'
11290			// perl -CS -ple 'use Encode;s/\\([0-7]{3})\\([0-7]{3})/decode("utf-8", chr(oct($1)).chr(oct($2)))/ge'
11291	218	100	const unordered_set czech_tokenizer::abbreviations_czech = {
		0
11292			// Titles
11293			"prof", "csc", "drsc", "doc", "phd", "ph", "d",
11294			"judr", "mddr", "mudr", "mvdr", "paeddr", "paedr", "phdr", "rndr", "rsdr", "dr",
11295			"ing", "arch", "mgr", "bc", "mag", "mba", "bca", "mga",
11296			"gen", "plk", "pplk", "npor", "por", "ppor", "kpt", "mjr", "sgt", "pls", "p", "s",
11297			"p", "p\303\255", "fa", "fy", "mr", "mrs", "ms", "miss", "tr", "sv",
11298			// Geographic names
11299			"angl", "fr", "\304\215es", "ces", "\304\215s", "cs", "slov", "n\304\233m", "nem", "it", "pol", "ma\304\217", "mad", "rus",
11300			"sev", "v\303\275ch", "vych", "ji\305\276", "jiz", "z\303\241p", "zap",
11301			// Common abbrevs
11302			"adr", "\304\215", "c", "eg", "ev", "g", "hod", "j", "kr", "m", "max", "min", "mj", "nap\305\231", "napr",
11303			"okr", "pop\305\231", "popr", "pozn", "r", "\305\231", "red", "rep", "resp", "srov", "st", "st\305\231", "str",
11304			"sv", "tel", "tj", "tzv", "\303\272", "u", "uh", "ul", "um", "zl", "zn",
11305			};
11306
11307	210	100	const unordered_set czech_tokenizer::abbreviations_slovak = {
		0
			// Abbreviation set that the constructor selects for the SLOVAK language.
			// Non-ASCII characters are written as octal escapes of their UTF-8
			// bytes, as in abbreviations_czech.
11308			// Titles
11309			"prof", "csc", "drsc", "doc", "phd", "ph", "d",
11310			"judr", "mddr", "mudr", "mvdr", "paeddr", "paedr", "phdr", "rndr", "rsdr", "dr",
11311			"ing", "arch", "mgr", "bc", "mag", "mba", "bca", "mga",
11312			"gen", "plk", "pplk", "npor", "por", "ppor", "kpt", "mjr", "sgt", "pls", "p", "s",
11313			"p", "p\303\255", "fa", "fy", "mr", "mrs", "ms", "miss", "tr", "sv",
11314			// Geographic names
11315			"angl", "fr", "\304\215es", "ces", "\304\215s", "cs", "slov", "nem", "it", "po\304\276", "pol", "ma\304\217", "mad",
11316			"rus", "sev", "v\303\275ch", "vych", "ju\305\276", "juz", "z\303\241p", "zap",
11317			// Common abbrevs
11318			"adr", "\304\215", "c", "eg", "ev", "g", "hod", "j", "kr", "m", "max", "min", "mj", "napr",
11319			"okr", "popr", "pozn", "r", "red", "rep", "resp", "srov", "st", "str",
11320			"sv", "tel", "tj", "tzv", "\303\272", "u", "uh", "ul", "um", "zl", "zn",
11321			};
11322
11323	0		czech_tokenizer::czech_tokenizer(tokenizer_language language, unsigned version, const morpho* m)
			// Builds a tokenizer for `language`. Versions <= 1 are passed to the
			// ragel_tokenizer base as 1, every other version as 2. `m` is stored
			// as given; merge_hyphenated does nothing when it is null.
11324	0	0	: ragel_tokenizer(version <= 1 ? 1 : 2), m(m) {
		0
			// Pick the abbreviation set matching the language.
			// NOTE(review): there is no default case, so for any other value
			// `abbreviations` is not assigned here — confirm it is initialized
			// elsewhere or that callers only pass CZECH or SLOVAK.
11325	0		switch (language) {
11326			case CZECH:
11327	0		abbreviations = &abbreviations_czech;
11328	0		break;
11329			case SLOVAK:
11330	0		abbreviations = &abbreviations_slovak;
11331	0		break;
11332			}
11333	0		}
11334
11335	0		void czech_tokenizer::merge_hyphenated(vector& tokens) {
			// Tries to merge the last token with one or two preceding
			// "word + single punctuation character" pairs into a single token,
			// when the morphological dictionary `m` accepts the joined text.
			// Does nothing when no dictionary is available.
11336			using namespace unilib;
11337
11338	0	0	if (!m) return;
			// Only consider a last token whose first character has no category
			// bits outside unicode::L.
11339	0	0	if (tokens.empty() \|\| chars[tokens.back().start].cat & ~unicode::L) return;
		0
		0
11340
			// Largest number of hyphens (1 or 2) for which the joined text was accepted.
11341			unsigned matched_hyphens = 0;
11342	0	0	for (unsigned hyphens = 1; hyphens <= 2; hyphens++) {
11343			// Are the tokens a sequence of 'hyphens' hyphenated tokens?
11344	0	0	if (tokens.size() < 2*hyphens + 1) break;
11345	0		unsigned first_hyphen = tokens.size() - 2*hyphens;
			// Stop unless the candidate separator is a single character with no
			// category bits outside unicode::P, it directly touches both
			// neighbouring tokens, and the token before it starts with a character
			// in unicode::L.
11346	0	0	if (tokens[first_hyphen].length != 1 \|\| chars[tokens[first_hyphen].start].cat & ~unicode::P \|\|
		0
		0
11347	0	0	tokens[first_hyphen].start + tokens[first_hyphen].length != tokens[first_hyphen + 1].start \|\|
11348	0	0	tokens[first_hyphen-1].start + tokens[first_hyphen-1].length != tokens[first_hyphen].start \|\|
		0
11349	0		chars[tokens[first_hyphen-1].start].cat & ~unicode::L)
11350			break;
11351
			// Analyze the text from the start of the token before the separator to
			// the end of the last token, with the guesser disabled; a
			// non-negative result counts as a match.
11352	0	0	if (m->analyze(string_piece(chars[tokens[first_hyphen-1].start].str, chars[tokens.back().start + tokens.back().length].str - chars[tokens[first_hyphen-1].start].str), morpho::NO_GUESSER, lemmas) >= 0)
11353			matched_hyphens = hyphens;
11354			}
11355
			// Extend the first token of the matched sequence to the end of the
			// last token and drop the tokens it absorbed.
11356	0	0	if (matched_hyphens) {
11357	0		unsigned first = tokens.size() - 2*matched_hyphens - 1;
11358	0		tokens[first].length = tokens.back().start + tokens.back().length - tokens[first].start;
11359	0		tokens.resize(first + 1);
11360			}
11361			}
11362
11363	0		bool czech_tokenizer::next_sentence(vector& tokens) {
			// Scans `chars` from position `current` and appends the tokens of the
			// next sentence to `tokens`. Returns whether `tokens` is non-empty
			// afterwards, or true right away when emergency_sentence_split fires
			// in the leading URL/e-mail loop.
			// NOTE(review): the table-driven scanner below looks like
			// Ragel-generated code; confirm where its grammar lives before
			// editing it by hand.
11364			using namespace unilib;
11365
			// cs: current scanner state; ts/te: start and end of the token being
			// matched; act is only assigned, never read (see the cast at the end).
11366			int cs, act;
11367			size_t ts, te;
11368			size_t whitespace = 0; // Suppress "may be uninitialized" warning
11369
			// Consume URLs/e-mails at the current position first.
11370	0	0	while (tokenize_url_email(tokens))
11371	0	0	if (emergency_sentence_split(tokens))
11372			return true;
11373
			// Scanner initialization.
11374			{
11375			cs = czech_tokenizer_start;
11376	0		ts = 0;
11377			te = 0;
11378			act = 0;
11379			}
11380
			// Scanner main loop; the input ends at chars.size() - 1.
11381			{
11382			int _klen;
11383			const short *_keys;
11384			int _trans;
11385			short _widec;
11386
11387	0	0	if ( ( current) == ( (chars.size() - 1)) )
11388			goto _test_eof;
11389			if ( cs == 0 )
11390			goto _out;
11391			_resume:
			// From-state action 6: remember where the token starts.
11392	0	0	switch ( _czech_tokenizer_from_state_actions[cs] ) {
11393			case 6:
11394	0		{ts = ( current);}
11395	0		break;
11396			}
11397
			// Compute the lookup key _widec. When the character falls into one of
			// the state's condition ranges, the key is moved to a higher band
			// (base 256 or 768) and a further 256 is added when the condition on
			// the preceding character holds.
11398	0		_widec = ( ragel_char(chars[current]));
11399	0		_klen = _czech_tokenizer_cond_lengths[cs];
11400	0		_keys = _czech_tokenizer_cond_keys + (_czech_tokenizer_cond_offsets[cs]*2);
11401	0	0	if ( _klen > 0 ) {
11402			const short *_lower = _keys;
11403			const short *_mid;
11404	0		const short *_upper = _keys + (_klen<<1) - 2;
11405			while (1) {
11406	0	0	if ( _upper < _lower )
11407			break;
11408
11409	0		_mid = _lower + (((_upper-_lower) >> 1) & ~1);
11410	0	0	if ( _widec < _mid[0] )
11411	0		_upper = _mid - 2;
11412	0	0	else if ( _widec > _mid[1] )
11413	0		_lower = _mid + 2;
11414			else {
11415	0		switch ( _czech_tokenizer_cond_spaces[_czech_tokenizer_cond_offsets[cs] + ((_mid - _keys)>>1)] ) {
			// Condition 0 holds at the start of input or when the previous
			// character has a category bit outside L, M, N and Pd.
11416			case 0: {
11417	0		_widec = (short)(256u + (( ragel_char(chars[current])) - 0u));
11418	0	0	if (
11419	0	0	!current \|\| (chars[current-1].cat & ~(unicode::L \| unicode::M \| unicode::N \| unicode::Pd)) ) _widec += 256;
		0
11420			break;
11421			}
			// Condition 1 holds at the start of input or when the previous
			// character has a category bit outside L, M and N and is not '+'.
11422			case 1: {
11423	0		_widec = (short)(768u + (( ragel_char(chars[current])) - 0u));
11424	0	0	if (
11425	0	0	!current \|\| ((chars[current-1].cat & ~(unicode::L \| unicode::M \| unicode::N)) && chars[current-1].chr != '+') ) _widec += 256;
		0
		0
11426			break;
11427			}
11428			}
11429			break;
11430			}
11431			}
11432			}
11433
			// Find the transition for _widec: binary search over the state's
			// single keys, then over its key ranges; if neither matches, _trans
			// ends up on the entry past all keys and ranges.
11434	0		_keys = _czech_tokenizer_trans_keys + _czech_tokenizer_key_offsets[cs];
11435	0		_trans = _czech_tokenizer_index_offsets[cs];
11436
11437	0		_klen = _czech_tokenizer_single_lengths[cs];
11438	0	0	if ( _klen > 0 ) {
11439			const short *_lower = _keys;
11440			const short *_mid;
11441	0		const short *_upper = _keys + _klen - 1;
11442			while (1) {
11443	0	0	if ( _upper < _lower )
11444			break;
11445
11446	0		_mid = _lower + ((_upper-_lower) >> 1);
11447	0	0	if ( _widec < *_mid )
11448	0		_upper = _mid - 1;
11449	0	0	else if ( _widec > *_mid )
11450	0		_lower = _mid + 1;
11451			else {
11452	0		_trans += (unsigned int)(_mid - _keys);
11453	0		goto _match;
11454			}
11455			}
11456	0		_keys += _klen;
11457	0		_trans += _klen;
11458			}
11459
11460	0		_klen = _czech_tokenizer_range_lengths[cs];
11461	0	0	if ( _klen > 0 ) {
11462			const short *_lower = _keys;
11463			const short *_mid;
11464	0		const short *_upper = _keys + (_klen<<1) - 2;
11465			while (1) {
11466	0	0	if ( _upper < _lower )
11467			break;
11468
11469	0		_mid = _lower + (((_upper-_lower) >> 1) & ~1);
11470	0	0	if ( _widec < _mid[0] )
11471	0		_upper = _mid - 2;
11472	0	0	else if ( _widec > _mid[1] )
11473	0		_lower = _mid + 2;
11474			else {
11475	0		_trans += (unsigned int)((_mid - _keys)>>1);
11476	0		goto _match;
11477			}
11478			}
11479	0		_trans += _klen;
11480			}
11481
11482			_match:
11483	0		_trans = _czech_tokenizer_indicies[_trans];
11484			_eof_trans:
11485	0		cs = _czech_tokenizer_trans_targs[_trans];
11486
11487	0	0	if ( _czech_tokenizer_trans_actions[_trans] == 0 )
11488			goto _again;
11489
			// Transition actions. Several of them share the same tail: continue
			// at te, let tokenize_url_email consume what follows, and leave the
			// scanner as soon as emergency_sentence_split reports a split.
11490	0		switch ( _czech_tokenizer_trans_actions[_trans] ) {
			// Action 3: remember the current position in `whitespace`.
11491			case 3:
11492	0		{ whitespace = current; }
11493	0		break;
			// Action 4: mark the token end just past the current character.
11494			case 4:
11495	0		{te = ( current)+1;}
11496	0		break;
			// Action 7: emit the token [ts, current + 1) and try to merge
			// hyphenated tokens.
11497			case 7:
11498	0		{te = ( current)+1;{ tokens.emplace_back(ts, te - ts);
11499	0		merge_hyphenated(tokens);
11500	0		current = te;
11501	0	0	do
11502	0	0	if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
11503			while (tokenize_url_email(tokens));
11504	0		( current)--;
11505			}}
11506	0		break;
			// Action 2: ask is_eos whether the character at ts ends the sentence
			// (given the abbreviation set), emit every character from ts up to
			// `whitespace` as a one-character token, and stop when it does.
11507			case 2:
11508	0		{te = ( current)+1;{
11509	0		bool eos = is_eos(tokens, chars[ts].chr, abbreviations);
11510	0	0	for (current = ts; current < whitespace; current++)
11511	0		tokens.emplace_back(current, 1);
11512	0		{( current) = (( whitespace))-1;}
11513	0	0	if (eos) {( current)++; goto _out; }
11514			}}
11515			break;
			// Action 10: a match ending at the current character that emits no
			// token; it ends the sentence when tokens were already collected.
11516			case 10:
11517	0		{te = ( current)+1;{
11518	0	0	if (!tokens.empty()) {( current)++; goto _out; }
11519	0		current = te;
11520	0	0	do
11521	0	0	if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
11522			while (tokenize_url_email(tokens));
11523	0		( current)--;
11524			}}
11525	0		break;
			// Action 11: like action 7, but the token ends before the current
			// character, which is scanned again.
11526			case 11:
11527	0		{te = ( current);( current)--;{ tokens.emplace_back(ts, te - ts);
11528	0		merge_hyphenated(tokens);
11529	0		current = te;
11530	0	0	do
11531	0	0	if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
11532			while (tokenize_url_email(tokens));
11533	0		( current)--;
11534			}}
11535	0		break;
			// Action 8: skip the matched text [ts, current) without emitting a token.
11536			case 8:
11537	0		{te = ( current);( current)--;{
11538	0		current = te;
11539	0	0	do
11540	0	0	if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
11541			while (tokenize_url_email(tokens));
11542	0		( current)--;
11543			}}
11544	0		break;
			// Action 9: like action 10, but the match ends before the current character.
11545			case 9:
11546	0		{te = ( current);( current)--;{
11547	0	0	if (!tokens.empty()) {( current)++; goto _out; }
11548	0		current = te;
11549	0	0	do
11550	0	0	if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
11551			while (tokenize_url_email(tokens));
11552	0		( current)--;
11553			}}
11554	0		break;
			// Action 1: go back to the previously marked end te and emit the
			// token [ts, te).
11555			case 1:
11556	0		{{( current) = ((te))-1;}{ tokens.emplace_back(ts, te - ts);
11557	0		merge_hyphenated(tokens);
11558	0		current = te;
11559	0	0	do
11560	0	0	if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
11561			while (tokenize_url_email(tokens));
11562	0		( current)--;
11563			}}
11564	0		break;
11565			}
11566
11567			_again:
			// To-state action 5: forget the token start.
11568	0	0	switch ( _czech_tokenizer_to_state_actions[cs] ) {
11569			case 5:
11570	0		{ts = 0;}
11571	0		break;
11572			}
11573
			// State 0 stops the scanner; otherwise advance to the next character.
11574	0	0	if ( cs == 0 )
11575			goto _out;
11576	0	0	if ( ++( current) != ( (chars.size() - 1)) )
11577			goto _resume;
11578			_test_eof: {}
			// At the end of input, take the state's end-of-input transition, if
			// it has one (table entries are one-based).
11579	0	0	if ( ( current) == ( (chars.size() - 1)) )
11580			{
11581	0	0	if ( _czech_tokenizer_eof_trans[cs] > 0 ) {
11582	0		_trans = _czech_tokenizer_eof_trans[cs] - 1;
11583	0		goto _eof_trans;
11584			}
11585			}
11586
11587			_out: {}
11588			}
11589
11590			(void)act; // Suppress unused variable warning
11591
11592	0		return !tokens.empty();
11593			}
11594
11595			} // namespace morphodita
11596
11597			/////////
11598			// File: morphodita/tokenizer/czech_tokenizer_factory.h
11599			/////////
11600
11601			// This file is part of MorphoDiTa .
11602			//
11603			// Copyright 2019 Institute of Formal and Applied Linguistics, Faculty of
11604			// Mathematics and Physics, Charles University in Prague, Czech Republic.
11605			//
11606			// This Source Code Form is subject to the terms of the Mozilla Public
11607			// License, v. 2.0. If a copy of the MPL was not distributed with this
11608			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
11609
11610			namespace morphodita {
11611
11612	0		class czech_tokenizer_factory : public tokenizer_factory {
			// Factory producing czech_tokenizer instances from a language and
			// version that load() reads from a stream.
11613			public:
11614			// Construct a new tokenizer instance.
11615			virtual tokenizer* new_tokenizer(const morpho* m) const override;
11616
			// Read the language and version from `is`; returns false when the
			// stream failed or the language is neither CZECH nor SLOVAK.
11617			bool load(istream& is);
11618			private:
			// Values read by load() and passed to the czech_tokenizer constructor.
11619			czech_tokenizer::tokenizer_language language;
11620			unsigned version;
11621			};
11622
11623			} // namespace morphodita
11624
11625			/////////
11626			// File: morphodita/tokenizer/czech_tokenizer_factory.cpp
11627			/////////
11628
11629			// This file is part of MorphoDiTa .
11630			//
11631			// Copyright 2019 Institute of Formal and Applied Linguistics, Faculty of
11632			// Mathematics and Physics, Charles University in Prague, Czech Republic.
11633			//
11634			// This Source Code Form is subject to the terms of the Mozilla Public
11635			// License, v. 2.0. If a copy of the MPL was not distributed with this
11636			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
11637
11638			namespace morphodita {
11639
11640	0		tokenizer* czech_tokenizer_factory::new_tokenizer(const morpho* m) const {
			// Allocates a czech_tokenizer with the stored language and version and
			// the given morphology `m`. The object is created with plain `new`;
			// presumably the caller takes ownership — confirm against callers.
11641	0	0	return new czech_tokenizer(language, version, m);
11642			}
11643
11644	0		bool czech_tokenizer_factory::load(istream& is) {
			// Reads the language and then the version, one is.get() call each.
11645	0		language = czech_tokenizer::tokenizer_language(is.get());
11646	0		version = is.get();
11647
			// Succeeds only if the stream is still good and the language is one
			// of the two supported values.
11648	0	0	return bool(is) && (language == czech_tokenizer::CZECH \|\| language == czech_tokenizer::SLOVAK);
		0
11649			}
11650
11651			} // namespace morphodita
11652
11653			/////////
11654			// File: morphodita/tokenizer/czech_tokenizer_factory_encoder.h
11655			/////////
11656
11657			// This file is part of MorphoDiTa .
11658			//
11659			// Copyright 2019 Institute of Formal and Applied Linguistics, Faculty of
11660			// Mathematics and Physics, Charles University in Prague, Czech Republic.
11661			//
11662			// This Source Code Form is subject to the terms of the Mozilla Public
11663			// License, v. 2.0. If a copy of the MPL was not distributed with this
11664			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
11665
11666			namespace morphodita {
11667
11668			class czech_tokenizer_factory_encoder {
11669			public:
			// Write `language` and `version` to `os`, one os.put() call each, in
			// the order in which czech_tokenizer_factory::load reads them.
11670			static void encode(czech_tokenizer::tokenizer_language language, unsigned version, ostream& os);
11671			};
11672
11673			} // namespace morphodita
11674
11675			/////////
11676			// File: morphodita/tokenizer/czech_tokenizer_factory_encoder.cpp
11677			/////////
11678
11679			// This file is part of MorphoDiTa .
11680			//
11681			// Copyright 2019 Institute of Formal and Applied Linguistics, Faculty of
11682			// Mathematics and Physics, Charles University in Prague, Czech Republic.
11683			//
11684			// This Source Code Form is subject to the terms of the Mozilla Public
11685			// License, v. 2.0. If a copy of the MPL was not distributed with this
11686			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
11687
11688			namespace morphodita {
11689
11690	0		void czech_tokenizer_factory_encoder::encode(czech_tokenizer::tokenizer_language language, unsigned version, ostream& os) {
			// Each value is written with a single os.put() call, language first,
			// matching the order of the two is.get() calls in
			// czech_tokenizer_factory::load.
11691	0		os.put(language);
11692	0		os.put(version);
11693	0		}
11694
11695			} // namespace morphodita
11696
11697			/////////
11698			// File: morphodita/tokenizer/english_tokenizer.cpp
11699			/////////
11700
11701			// This file is part of MorphoDiTa .
11702			//
11703			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
11704			// Mathematics and Physics, Charles University in Prague, Czech Republic.
11705			//
11706			// This Source Code Form is subject to the terms of the Mozilla Public
11707			// License, v. 2.0. If a copy of the MPL was not distributed with this
11708			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
11709
11710			namespace morphodita {
11711
11712			// The list of lowercased words that, when preceding an end-of-sentence
			// character, do not end the sentence. Some entries appear more than
			// once in the initializer (e.g. "dr", "mr", "gen").
11713	232	100	const unordered_set english_tokenizer::abbreviations = {
		0
11714			// Titles
11715			"adj", "adm", "adv", "assoc", "asst", "bart", "bldg", "brig", "bros", "capt",
11716			"cmdr", "col", "comdr", "con", "corp", "cpl", "d", "dr", "dr", "drs", "ens",
11717			"gen", "gov", "hon", "hosp", "hr", "insp", "lt", "mm", "mr", "mrs", "ms",
11718			"maj", "messrs", "mlle", "mme", "mr", "mrs", "ms", "msgr", "op", "ord",
11719			"pfc", "ph", "phd", "prof", "pvt", "rep", "reps", "res", "rev", "rt", "sen",
11720			"sens", "sfc", "sgt", "sr", "st", "supt", "surg", "univ",
11721			// Common abbrevs
11722			"addr", "approx", "apr", "aug", "calif", "co", "corp", "dec", "def", "e",
11723			"e.g", "eg", "feb", "fla", "ft", "gen", "gov", "hrs", "i.", "i.e", "ie",
11724			"inc", "jan", "jr", "ltd", "mar", "max", "min", "mph", "mt", "n", "nov",
11725			"oct", "ont", "pa", "pres", "rep", "rev", "s", "sec", "sen", "sep", "sept",
11726			"sgt", "sr", "tel", "un", "univ", "v", "va", "vs", "w", "yrs",
11727			};
11728
11729			static const char _english_tokenizer_split_token_key_offsets[] = {
			// Indexed by state of the split_token matcher. Offset of the state's
			// first key in _english_tokenizer_split_token_trans_keys.
11730			0, 0, 16, 20, 22, 26, 28, 30,
11731			32, 34, 36, 44, 46, 50, 52, 54,
11732			56, 58, 60, 62, 64, 66, 68, 72,
11733			74, 76, 78, 80, 82, 82
11734			};
11735
11736			static const unsigned char _english_tokenizer_split_token_trans_keys[] = {
			// Transition keys of all states, binary-searched per state by
			// split_token. Every state has only single keys, since all range
			// lengths are 0.
11737			65u, 68u, 69u, 76u, 77u, 78u, 83u, 84u,
11738			97u, 100u, 101u, 108u, 109u, 110u, 115u, 116u,
11739			78u, 84u, 110u, 116u, 78u, 110u, 65u, 79u,
11740			97u, 111u, 87u, 119u, 71u, 103u, 84u, 116u,
11741			79u, 111u, 39u, 161u, 77u, 82u, 86u, 89u,
11742			109u, 114u, 118u, 121u, 77u, 109u, 69u, 73u,
11743			101u, 105u, 76u, 108u, 39u, 161u, 68u, 100u,
11744			76u, 108u, 39u, 161u, 69u, 101u, 82u, 114u,
11745			79u, 111u, 77u, 109u, 39u, 79u, 111u, 161u,
11746			78u, 110u, 78u, 110u, 78u, 110u, 65u, 97u,
11747			67u, 99u, 0
11748			};
11749
11750			static const char _english_tokenizer_split_token_single_lengths[] = {
			// Indexed by state. Number of single (exact-match) keys of the state.
11751			0, 16, 4, 2, 4, 2, 2, 2,
11752			2, 2, 8, 2, 4, 2, 2, 2,
11753			2, 2, 2, 2, 2, 2, 4, 2,
11754			2, 2, 2, 2, 0, 0
11755			};
11756
11757			static const char _english_tokenizer_split_token_range_lengths[] = {
			// Indexed by state. Number of low/high key ranges of the state; all
			// zero, so the range search in split_token is never entered.
11758			0, 0, 0, 0, 0, 0, 0, 0,
11759			0, 0, 0, 0, 0, 0, 0, 0,
11760			0, 0, 0, 0, 0, 0, 0, 0,
11761			0, 0, 0, 0, 0, 0
11762			};
11763
11764			static const unsigned char _english_tokenizer_split_token_index_offsets[] = {
			// Indexed by state. Base position of the state's entries in
			// _english_tokenizer_split_token_indicies.
11765			0, 0, 17, 22, 25, 30, 33, 36,
11766			39, 42, 45, 54, 57, 62, 65, 68,
11767			71, 74, 77, 80, 83, 86, 89, 94,
11768			97, 100, 103, 106, 109, 110
11769			};
11770
11771			static const char _english_tokenizer_split_token_indicies[] = {
			// Maps a lookup position (state's index offset plus the index of the
			// matched key, or the entry past all keys when nothing matched) to a
			// transition number.
11772			0, 2, 3, 4, 2, 5, 2, 6,
11773			0, 2, 3, 4, 2, 5, 2, 6,
11774			1, 7, 8, 7, 8, 1, 9, 9,
11775			1, 10, 11, 10, 11, 1, 12, 12,
11776			1, 12, 12, 1, 13, 13, 1, 11,
11777			11, 1, 14, 14, 1, 15, 2, 2,
11778			16, 15, 2, 2, 16, 1, 17, 17,
11779			1, 18, 11, 18, 11, 1, 12, 12,
11780			1, 19, 19, 1, 12, 12, 1, 2,
11781			2, 1, 20, 20, 1, 21, 21, 1,
11782			22, 22, 1, 23, 23, 1, 12, 12,
11783			1, 24, 25, 25, 24, 1, 14, 14,
11784			1, 26, 26, 1, 27, 27, 1, 28,
11785			28, 1, 12, 12, 1, 1, 1, 0
11786			};
11787
11788			static const char _english_tokenizer_split_token_trans_targs[] = {
			// Indexed by transition number. State entered when the transition is
			// taken; split_token stops once the state becomes 0.
11789			2, 0, 9, 10, 16, 17, 22, 3,
11790			7, 4, 5, 6, 28, 8, 29, 11,
11791			14, 12, 13, 15, 18, 19, 20, 21,
11792			23, 24, 25, 26, 27
11793			};
11794
11795			static const char _english_tokenizer_split_token_trans_actions[] = {
			// Indexed by transition number. Action id run by split_token: 1
			// records a split mark, 2 records a split mark and splits there
			// immediately, 0 means no action.
11796			0, 0, 0, 0, 0, 0, 0, 1,
11797			1, 0, 0, 0, 0, 0, 2, 1,
11798			1, 0, 0, 0, 1, 0, 0, 0,
11799			0, 0, 1, 0, 0
11800			};
11801
11802			static const char _english_tokenizer_split_token_eof_actions[] = {
			// Indexed by state. Action id run when the whole token has been
			// consumed in the state; 3 makes split_token split at the last
			// recorded split mark.
11803			0, 0, 0, 0, 0, 0, 0, 0,
11804			0, 0, 0, 0, 0, 0, 0, 0,
11805			0, 0, 0, 0, 0, 0, 0, 0,
11806			0, 0, 0, 0, 3, 0
11807			};
11808
11809			static const int english_tokenizer_split_token_start = 1; // State in which english_tokenizer::split_token starts matching.
11810
11811	0		void english_tokenizer::split_token(vector& tokens) {
			// Splits a suffix off the last token when the table-driven matcher
			// below recognizes one. The matcher reads the token's characters from
			// the last one towards the first. Does nothing when `tokens` is empty
			// or the last token's first character has a category bit outside
			// unicode::L.
			// NOTE(review): the matcher looks like Ragel-generated code; confirm
			// where its grammar lives before editing it by hand.
11812	0	0	if (tokens.empty() \|\| chars[tokens.back().start].cat & ~unilib::unicode::L) return;
		0
		0
11813
			// index runs over [start, end) of the last token; split_mark is the
			// number of trailing characters consumed when a mark was recorded and
			// split_len is the length of the suffix to split off (0 = no split).
11814	0		size_t index = tokens.back().start, end = index + tokens.back().length;
11815			int cs;
11816	0		size_t split_mark = 0, split_len = 0;
11817
11818			{
11819			cs = english_tokenizer_split_token_start;
11820			}
11821
11822			{
11823			int _klen;
11824			const unsigned char *_keys;
11825			int _trans;
11826
11827	0	0	if ( ( index) == ( end) )
11828			goto _test_eof;
11829			if ( cs == 0 )
11830			goto _out;
11831			_resume:
11832	0		_keys = _english_tokenizer_split_token_trans_keys + _english_tokenizer_split_token_key_offsets[cs];
11833	0		_trans = _english_tokenizer_split_token_index_offsets[cs];
11834
			// Binary search over the state's single keys. The examined character
			// is chars[start + end - index - 1], i.e. the token is read backwards.
11835	0		_klen = _english_tokenizer_split_token_single_lengths[cs];
11836	0	0	if ( _klen > 0 ) {
11837			const unsigned char *_lower = _keys;
11838			const unsigned char *_mid;
11839	0		const unsigned char *_upper = _keys + _klen - 1;
11840			while (1) {
11841	0	0	if ( _upper < _lower )
11842			break;
11843
11844	0		_mid = _lower + ((_upper-_lower) >> 1);
11845	0	0	if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) < *_mid )
11846	0		_upper = _mid - 1;
11847	0	0	else if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) > *_mid )
11848	0		_lower = _mid + 1;
11849			else {
11850	0		_trans += (unsigned int)(_mid - _keys);
11851	0		goto _match;
11852			}
11853			}
11854	0		_keys += _klen;
11855	0		_trans += _klen;
11856			}
11857
			// Binary search over the state's key ranges.
11858	0		_klen = _english_tokenizer_split_token_range_lengths[cs];
11859	0	0	if ( _klen > 0 ) {
11860			const unsigned char *_lower = _keys;
11861			const unsigned char *_mid;
11862	0		const unsigned char *_upper = _keys + (_klen<<1) - 2;
11863			while (1) {
11864	0	0	if ( _upper < _lower )
11865			break;
11866
11867	0		_mid = _lower + (((_upper-_lower) >> 1) & ~1);
11868	0	0	if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) < _mid[0] )
11869	0		_upper = _mid - 2;
11870	0	0	else if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) > _mid[1] )
11871	0		_lower = _mid + 2;
11872			else {
11873	0		_trans += (unsigned int)((_mid - _keys)>>1);
11874	0		goto _match;
11875			}
11876			}
11877	0		_trans += _klen;
11878			}
11879
11880			_match:
11881	0		_trans = _english_tokenizer_split_token_indicies[_trans];
11882	0		cs = _english_tokenizer_split_token_trans_targs[_trans];
11883
11884	0	0	if ( _english_tokenizer_split_token_trans_actions[_trans] == 0 )
11885			goto _again;
11886
11887	0		switch ( _english_tokenizer_split_token_trans_actions[_trans] ) {
			// Action 1: record how many trailing characters were consumed so far.
11888			case 1:
11889	0		{ split_mark = index - tokens.back().start + 1; }
11890	0		break;
			// Action 2: record the mark and split there immediately.
11891			case 2:
11892	0		{ split_mark = index - tokens.back().start + 1; }
11893	0		{ split_len = split_mark; {( index)++; goto _out; } }
11894			break;
11895			}
11896
11897			_again:
11898	0	0	if ( cs == 0 )
11899			goto _out;
11900	0	0	if ( ++( index) != ( end) )
11901			goto _resume;
11902			_test_eof: {}
			// The whole token was consumed: end action 3 splits at the last mark.
11903	0	0	if ( ( index) == ( end) )
11904			{
11905	0	0	switch ( _english_tokenizer_split_token_eof_actions[cs] ) {
11906			case 3:
11907	0		{ split_len = split_mark; {( index)++; goto _out; } }
11908			break;
11909			}
11910			}
11911
11912			_out: {}
11913			}
11914
			// Shorten the last token by split_len characters and append the
			// split-off suffix as a new token.
			// NOTE(review): split_len counts characters within the token, while
			// `end` is an index into chars (token start + length); confirm whether
			// the intended bound is tokens.back().length.
11915	0	0	if (split_len && split_len < end) {
11916	0		tokens.back().length -= split_len;
11917	0		tokens.emplace_back(end - split_len, split_len);
11918			}
11919			}
11920
11921			static const char _english_tokenizer_cond_offsets[] = {
			// Indexed by scanner state. Offset (counted in low/high key pairs) of
			// the state's first condition range in _english_tokenizer_cond_keys;
			// english_tokenizer::next_sentence also uses it as the base index into
			// _english_tokenizer_cond_spaces.
11922			0, 0, 0, 0, 0, 0, 0, 0,
11923			0, 0, 0, 2, 2, 2, 2, 2,
11924			2, 2, 2, 2, 2, 2, 2, 2,
11925			2, 2, 2, 2, 2
11926			};
11927
11928			static const char _english_tokenizer_cond_lengths[] = {
			// Indexed by scanner state. Number of condition key ranges searched
			// for the state; 0 means no condition is tested.
11929			0, 0, 0, 0, 0, 0, 0, 0,
11930			0, 0, 2, 0, 0, 0, 0, 0,
11931			0, 0, 0, 0, 0, 0, 0, 0,
11932			0, 0, 0, 0, 0
11933			};
11934
11935			static const short _english_tokenizer_cond_keys[] = {
			// Inclusive low/high pairs compared with ragel_char(chars[current]) to
			// decide whether a condition applies: 43 and 45 are the ASCII codes of
			// '+' and '-'.
11936			43u, 43u, 45u, 45u, 0
11937			};
11938
11939			static const char _english_tokenizer_cond_spaces[] = {
			// For each condition key range, the id of the condition that
			// english_tokenizer::next_sentence evaluates (cases 0 and 1 of its
			// inner switch).
11940			1, 0, 0
11941			};
11942
11943			static const unsigned char _english_tokenizer_key_offsets[] = {
			// Indexed by scanner state. Offset of the state's first key in
			// _english_tokenizer_trans_keys.
11944			0, 0, 17, 29, 43, 46, 49, 52,
11945			55, 60, 63, 98, 103, 107, 110, 114,
11946			119, 120, 125, 126, 131, 145, 152, 156,
11947			161, 164, 179, 192, 206
11948			};
11949
11950			static const short _english_tokenizer_trans_keys[] = {
			// Transition keys of all states, addressed through
			// _english_tokenizer_key_offsets.
			// NOTE(review): presumably laid out per state as single keys followed
			// by low/high range pairs, as in _czech_tokenizer_trans_keys — confirm.
11951			13u, 32u, 34u, 40u, 91u, 96u, 123u, 129u,
11952			133u, 135u, 147u, 150u, 162u, 9u, 10u, 65u,
11953			90u, 34u, 40u, 91u, 96u, 123u, 129u, 133u,
11954			135u, 150u, 162u, 65u, 90u, 13u, 32u, 34u,
11955			39u, 41u, 59u, 93u, 125u, 139u, 141u, 147u,
11956			161u, 9u, 10u, 159u, 48u, 57u, 159u, 48u,
11957			57u, 159u, 48u, 57u, 159u, 48u, 57u, 43u,
11958			45u, 159u, 48u, 57u, 159u, 48u, 57u, 9u,
11959			10u, 13u, 32u, 33u, 44u, 46u, 47u, 63u,
11960			129u, 131u, 135u, 142u, 147u, 157u, 159u, 160u,
11961			301u, 557u, 811u, 1067u, 0u, 42u, 48u, 57u,
11962			58u, 64u, 65u, 90u, 91u, 96u, 97u, 122u,
11963			123u, 255u, 9u, 10u, 13u, 32u, 147u, 9u,
11964			13u, 32u, 147u, 9u, 32u, 147u, 9u, 10u,
11965			32u, 147u, 9u, 10u, 13u, 32u, 147u, 13u,
11966			9u, 10u, 13u, 32u, 147u, 10u, 9u, 10u,
11967			13u, 32u, 147u, 13u, 32u, 34u, 39u, 41u,
11968			59u, 93u, 125u, 139u, 141u, 147u, 161u, 9u,
11969			10u, 44u, 46u, 69u, 101u, 159u, 48u, 57u,
11970			44u, 46u, 69u, 101u, 69u, 101u, 159u, 48u,
11971			57u, 159u, 48u, 57u, 39u, 45u, 129u, 131u,
11972			135u, 151u, 155u, 157u, 161u, 65u, 90u, 97u,
11973			122u, 142u, 143u, 45u, 129u, 131u, 135u, 151u,
11974			155u, 157u, 65u, 90u, 97u, 122u, 142u, 143u,
11975			39u, 129u, 131u, 135u, 151u, 155u, 157u, 161u,
11976			65u, 90u, 97u, 122u, 142u, 143u, 159u, 48u,
11977			57u, 0
11978			};
11979
11980			static const char _english_tokenizer_single_lengths[] = {
			// One entry per scanner state.
			// NOTE(review): presumably the number of single (exact-match) keys of
			// each state, like _czech_tokenizer_single_lengths — confirm.
11981			0, 13, 10, 12, 1, 1, 1, 1,
11982			3, 1, 21, 5, 4, 3, 4, 5,
11983			1, 5, 1, 5, 12, 5, 4, 3,
11984			1, 9, 7, 8, 1
11985			};
11986
11987			static const char _english_tokenizer_range_lengths[] = {
			// One entry per scanner state.
			// NOTE(review): presumably the number of low/high key ranges of each
			// state, like _czech_tokenizer_range_lengths — confirm.
11988			0, 2, 1, 1, 1, 1, 1, 1,
11989			1, 1, 7, 0, 0, 0, 0, 0,
11990			0, 0, 0, 0, 1, 1, 0, 1,
11991			1, 3, 3, 3, 1
11992			};
11993
11994			static const unsigned char _english_tokenizer_index_offsets[] = {
			// Indexed by scanner state. Initial value of the transition lookup
			// position for the state; presumably a base position in
			// _english_tokenizer_indicies — confirm.
11995			0, 0, 16, 28, 42, 45, 48, 51,
11996			54, 59, 62, 91, 97, 102, 106, 111,
11997			117, 119, 125, 127, 133, 147, 154, 159,
11998			164, 167, 180, 191, 203
11999			};
12000
12001			static const char _english_tokenizer_indicies[] = {
			// NOTE(review): presumably maps a lookup position to a transition
			// number, like _czech_tokenizer_indicies — confirm.
12002			1, 1, 2, 2, 2, 2, 2, 3,
12003			2, 3, 1, 2, 2, 1, 3, 0,
12004			2, 2, 2, 2, 2, 3, 2, 3,
12005			2, 2, 3, 0, 4, 4, 5, 5,
12006			5, 5, 5, 5, 5, 5, 4, 5,
12007			4, 0, 6, 6, 0, 7, 7, 0,
12008			8, 8, 0, 9, 9, 0, 10, 10,
12009			11, 11, 0, 11, 11, 0, 13, 14,
12010			15, 13, 16, 12, 16, 12, 16, 19,
12011			19, 19, 19, 13, 19, 18, 16, 12,
12012			20, 12, 20, 12, 18, 12, 19, 12,
12013			19, 12, 17, 13, 22, 23, 13, 13,
12014			21, 13, 24, 13, 13, 21, 13, 13,
12015			13, 21, 13, 24, 13, 13, 21, 13,
12016			25, 26, 13, 13, 21, 28, 27, 13,
12017			25, 29, 13, 13, 21, 28, 27, 13,
12018			26, 29, 13, 13, 21, 4, 4, 5,
12019			5, 5, 5, 5, 5, 5, 5, 4,
12020			5, 4, 30, 31, 32, 33, 33, 18,
12021			18, 30, 31, 32, 33, 33, 30, 33,
12022			33, 9, 9, 30, 11, 11, 30, 34,
12023			35, 19, 19, 19, 19, 19, 19, 34,
12024			19, 19, 19, 30, 35, 19, 19, 19,
12025			19, 19, 19, 19, 19, 19, 30, 34,
12026			19, 19, 19, 19, 19, 19, 34, 19,
12027			19, 19, 30, 18, 18, 30, 0
12028			};
12029
12030			static const char _english_tokenizer_trans_targs[] = {
			// NOTE(review): presumably the state entered for each transition
			// number, like _czech_tokenizer_trans_targs — confirm.
12031			10, 1, 2, 10, 1, 3, 5, 6,
12032			22, 23, 9, 24, 10, 11, 15, 19,
12033			20, 0, 21, 25, 28, 10, 12, 14,
12034			13, 16, 17, 10, 10, 18, 10, 4,
12035			7, 8, 26, 27
12036			};
12037
12038			static const char _english_tokenizer_trans_actions[] = {
			// NOTE(review): presumably the action id run for each transition
			// number (0 = none), like _czech_tokenizer_trans_actions — confirm.
12039			1, 0, 0, 2, 3, 0, 0, 0,
12040			4, 4, 0, 0, 7, 0, 0, 0,
12041			4, 0, 4, 0, 0, 8, 0, 0,
12042			0, 0, 0, 9, 10, 0, 11, 0,
12043			0, 0, 0, 0
12044			};
12045
12046			static const char _english_tokenizer_to_state_actions[] = {
			// One entry per scanner state; only the start state (10) has a
			// non-zero entry.
			// NOTE(review): presumably the action run after entering a state, like
			// _czech_tokenizer_to_state_actions — confirm.
12047			0, 0, 0, 0, 0, 0, 0, 0,
12048			0, 0, 5, 0, 0, 0, 0, 0,
12049			0, 0, 0, 0, 0, 0, 0, 0,
12050			0, 0, 0, 0, 0
12051			};
12052
12053			static const char _english_tokenizer_from_state_actions[] = {
			// Indexed by scanner state. Action run before a character is examined
			// in the state; the only id handled by
			// english_tokenizer::next_sentence is 6, which sets ts to the current
			// position.
12054			0, 0, 0, 0, 0, 0, 0, 0,
12055			0, 0, 6, 0, 0, 0, 0, 0,
12056			0, 0, 0, 0, 0, 0, 0, 0,
12057			0, 0, 0, 0, 0
12058			};
12059
12060			static const unsigned char _english_tokenizer_eof_trans[] = {
			// One entry per scanner state.
			// NOTE(review): presumably the one-based transition taken when the
			// input ends in a state (0 = none), like _czech_tokenizer_eof_trans —
			// confirm.
12061			0, 1, 1, 1, 1, 1, 1, 1,
12062			1, 1, 0, 22, 22, 22, 22, 22,
12063			28, 22, 28, 22, 31, 31, 31, 31,
12064			31, 31, 31, 31, 31
12065			};
12066
12067			static const int english_tokenizer_start = 10; // State in which english_tokenizer::next_sentence starts scanning.
12068
12069	0	0	english_tokenizer::english_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} // Versions <= 1 are passed to the ragel_tokenizer base as 1, every other version as 2.
		0
		0
		0
12070
12071	0		bool english_tokenizer::next_sentence(vector& tokens) {
12072			using namespace unilib;
12073
12074			int cs, act;
12075			size_t ts, te;
12076			size_t whitespace = 0; // Suppress "may be uninitialized" warning
12077
12078	0	0	while (tokenize_url_email(tokens))
12079	0	0	if (emergency_sentence_split(tokens))
12080			return true;
12081
12082			{
12083			cs = english_tokenizer_start;
12084	0		ts = 0;
12085			te = 0;
12086			act = 0;
12087			}
12088
12089			{
12090			int _klen;
12091			const short *_keys;
12092			int _trans;
12093			short _widec;
12094
12095	0	0	if ( ( current) == ( (chars.size() - 1)) )
12096			goto _test_eof;
12097			if ( cs == 0 )
12098			goto _out;
12099			_resume:
12100	0	0	switch ( _english_tokenizer_from_state_actions[cs] ) {
12101			case 6:
12102	0		{ts = ( current);}
12103	0		break;
12104			}
12105
12106	0		_widec = ( ragel_char(chars[current]));
12107	0		_klen = _english_tokenizer_cond_lengths[cs];
12108	0		_keys = _english_tokenizer_cond_keys + (_english_tokenizer_cond_offsets[cs]*2);
12109	0	0	if ( _klen > 0 ) {
12110			const short *_lower = _keys;
12111			const short *_mid;
12112	0		const short *_upper = _keys + (_klen<<1) - 2;
12113			while (1) {
12114	0	0	if ( _upper < _lower )
12115			break;
12116
12117	0		_mid = _lower + (((_upper-_lower) >> 1) & ~1);
12118	0	0	if ( _widec < _mid[0] )
12119	0		_upper = _mid - 2;
12120	0	0	else if ( _widec > _mid[1] )
12121	0		_lower = _mid + 2;
12122			else {
12123	0		switch ( _english_tokenizer_cond_spaces[_english_tokenizer_cond_offsets[cs] + ((_mid - _keys)>>1)] ) {
12124			case 0: {
12125	0		_widec = (short)(256u + (( ragel_char(chars[current])) - 0u));
12126	0	0	if (
12127	0	0	!current \|\| (chars[current-1].cat & ~(unicode::L \| unicode::M \| unicode::N \| unicode::Pd)) ) _widec += 256;
		0
12128			break;
12129			}
12130			case 1: {
12131	0		_widec = (short)(768u + (( ragel_char(chars[current])) - 0u));
12132	0	0	if (
12133	0	0	!current \|\| ((chars[current-1].cat & ~(unicode::L \| unicode::M \| unicode::N)) && chars[current-1].chr != '+') ) _widec += 256;
		0
		0
12134			break;
12135			}
12136			}
12137			break;
12138			}
12139			}
12140			}
12141
12142	0		_keys = _english_tokenizer_trans_keys + _english_tokenizer_key_offsets[cs];
12143	0		_trans = _english_tokenizer_index_offsets[cs];
12144
12145	0		_klen = _english_tokenizer_single_lengths[cs];
12146	0	0	if ( _klen > 0 ) {
12147			const short *_lower = _keys;
12148			const short *_mid;
12149	0		const short *_upper = _keys + _klen - 1;
12150			while (1) {
12151	0	0	if ( _upper < _lower )
12152			break;
12153
12154	0		_mid = _lower + ((_upper-_lower) >> 1);
12155	0	0	if ( _widec < *_mid )
12156	0		_upper = _mid - 1;
12157	0	0	else if ( _widec > *_mid )
12158	0		_lower = _mid + 1;
12159			else {
12160	0		_trans += (unsigned int)(_mid - _keys);
12161	0		goto _match;
12162			}
12163			}
12164	0		_keys += _klen;
12165	0		_trans += _klen;
12166			}
12167
12168	0		_klen = _english_tokenizer_range_lengths[cs];
12169	0	0	if ( _klen > 0 ) {
12170			const short *_lower = _keys;
12171			const short *_mid;
12172	0		const short *_upper = _keys + (_klen<<1) - 2;
12173			while (1) {
12174	0	0	if ( _upper < _lower )
12175			break;
12176
12177	0		_mid = _lower + (((_upper-_lower) >> 1) & ~1);
12178	0	0	if ( _widec < _mid[0] )
12179	0		_upper = _mid - 2;
12180	0	0	else if ( _widec > _mid[1] )
12181	0		_lower = _mid + 2;
12182			else {
12183	0		_trans += (unsigned int)((_mid - _keys)>>1);
12184	0		goto _match;
12185			}
12186			}
12187	0		_trans += _klen;
12188			}
12189
12190			_match:
12191	0		_trans = _english_tokenizer_indicies[_trans];
12192			_eof_trans:
12193	0		cs = _english_tokenizer_trans_targs[_trans];
12194
12195	0	0	if ( _english_tokenizer_trans_actions[_trans] == 0 )
12196			goto _again;
12197
12198	0		switch ( _english_tokenizer_trans_actions[_trans] ) {
12199			case 3:
12200	0		{ whitespace = current; }
12201	0		break;
12202			case 4:
12203	0		{te = ( current)+1;}
12204	0		break;
12205			case 7:
12206	0		{te = ( current)+1;{ tokens.emplace_back(ts, te - ts);
12207	0		split_token(tokens);
12208	0		current = te;
12209	0	0	do
12210	0	0	if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12211			while (tokenize_url_email(tokens));
12212	0		( current)--;
12213			}}
12214	0		break;
12215			case 2:
12216	0		{te = ( current)+1;{
12217	0		bool eos = is_eos(tokens, chars[ts].chr, &abbreviations);
12218	0	0	for (current = ts; current < whitespace; current++)
12219	0		tokens.emplace_back(current, 1);
12220	0		{( current) = (( whitespace))-1;}
12221	0	0	if (eos) {( current)++; goto _out; }
12222			}}
12223			break;
12224			case 10:
12225	0		{te = ( current)+1;{
12226	0	0	if (!tokens.empty()) {( current)++; goto _out; }
12227	0		current = te;
12228	0	0	do
12229	0	0	if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12230			while (tokenize_url_email(tokens));
12231	0		( current)--;
12232			}}
12233	0		break;
12234			case 11:
12235	0		{te = ( current);( current)--;{ tokens.emplace_back(ts, te - ts);
12236	0		split_token(tokens);
12237	0		current = te;
12238	0	0	do
12239	0	0	if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12240			while (tokenize_url_email(tokens));
12241	0		( current)--;
12242			}}
12243	0		break;
12244			case 8:
12245	0		{te = ( current);( current)--;{
12246	0		current = te;
12247	0	0	do
12248	0	0	if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12249			while (tokenize_url_email(tokens));
12250	0		( current)--;
12251			}}
12252	0		break;
12253			case 9:
12254	0		{te = ( current);( current)--;{
12255	0	0	if (!tokens.empty()) {( current)++; goto _out; }
12256	0		current = te;
12257	0	0	do
12258	0	0	if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12259			while (tokenize_url_email(tokens));
12260	0		( current)--;
12261			}}
12262	0		break;
12263			case 1:
12264	0		{{( current) = ((te))-1;}{ tokens.emplace_back(ts, te - ts);
12265	0		split_token(tokens);
12266	0		current = te;
12267	0	0	do
12268	0	0	if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12269			while (tokenize_url_email(tokens));
12270	0		( current)--;
12271			}}
12272	0		break;
12273			}
12274
12275			_again:
12276	0	0	switch ( _english_tokenizer_to_state_actions[cs] ) {
12277			case 5:
12278	0		{ts = 0;}
12279	0		break;
12280			}
12281
12282	0	0	if ( cs == 0 )
12283			goto _out;
12284	0	0	if ( ++( current) != ( (chars.size() - 1)) )
12285			goto _resume;
12286			_test_eof: {}
12287	0	0	if ( ( current) == ( (chars.size() - 1)) )
12288			{
12289	0	0	if ( _english_tokenizer_eof_trans[cs] > 0 ) {
12290	0		_trans = _english_tokenizer_eof_trans[cs] - 1;
12291	0		goto _eof_trans;
12292			}
12293			}
12294
12295			_out: {}
12296			}
12297
12298			(void)act; // Suppress unused variable warning
12299
12300	0		return !tokens.empty();
12301			}
12302
12303			} // namespace morphodita
12304
12305			/////////
12306			// File: morphodita/tokenizer/generic_tokenizer.cpp
12307			/////////
12308
12309			// This file is part of MorphoDiTa .
12310			//
12311			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
12312			// Mathematics and Physics, Charles University in Prague, Czech Republic.
12313			//
12314			// This Source Code Form is subject to the terms of the Mozilla Public
12315			// License, v. 2.0. If a copy of the MPL was not distributed with this
12316			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
12317
12318			namespace morphodita {
12319
12320			static const char _generic_tokenizer_cond_offsets[] = {
12321			0, 0, 0, 0, 0, 0, 0, 0,
12322			2, 2, 2, 2, 2, 2, 2, 2,
12323			2, 2, 2, 2, 2, 2, 2
12324			};
12325
12326			static const char _generic_tokenizer_cond_lengths[] = {
12327			0, 0, 0, 0, 0, 0, 0, 2,
12328			0, 0, 0, 0, 0, 0, 0, 0,
12329			0, 0, 0, 0, 0, 0, 0
12330			};
12331
12332			static const short _generic_tokenizer_cond_keys[] = {
12333			43u, 43u, 45u, 45u, 0
12334			};
12335
12336			static const char _generic_tokenizer_cond_spaces[] = {
12337			1, 0, 0
12338			};
12339
12340			static const unsigned char _generic_tokenizer_key_offsets[] = {
12341			0, 0, 17, 29, 43, 46, 51, 54,
12342			89, 94, 98, 101, 105, 110, 111, 116,
12343			117, 122, 136, 142, 147, 150, 162
12344			};
12345
12346			static const short _generic_tokenizer_trans_keys[] = {
12347			13u, 32u, 34u, 40u, 91u, 96u, 123u, 129u,
12348			133u, 135u, 147u, 150u, 162u, 9u, 10u, 65u,
12349			90u, 34u, 40u, 91u, 96u, 123u, 129u, 133u,
12350			135u, 150u, 162u, 65u, 90u, 13u, 32u, 34u,
12351			39u, 41u, 59u, 93u, 125u, 139u, 141u, 147u,
12352			161u, 9u, 10u, 159u, 48u, 57u, 43u, 45u,
12353			159u, 48u, 57u, 159u, 48u, 57u, 9u, 10u,
12354			13u, 32u, 33u, 44u, 46u, 47u, 63u, 129u,
12355			131u, 135u, 142u, 147u, 157u, 159u, 160u, 301u,
12356			557u, 811u, 1067u, 0u, 42u, 48u, 57u, 58u,
12357			64u, 65u, 90u, 91u, 96u, 97u, 122u, 123u,
12358			255u, 9u, 10u, 13u, 32u, 147u, 9u, 13u,
12359			32u, 147u, 9u, 32u, 147u, 9u, 10u, 32u,
12360			147u, 9u, 10u, 13u, 32u, 147u, 13u, 9u,
12361			10u, 13u, 32u, 147u, 10u, 9u, 10u, 13u,
12362			32u, 147u, 13u, 32u, 34u, 39u, 41u, 59u,
12363			93u, 125u, 139u, 141u, 147u, 161u, 9u, 10u,
12364			46u, 69u, 101u, 159u, 48u, 57u, 69u, 101u,
12365			159u, 48u, 57u, 159u, 48u, 57u, 129u, 131u,
12366			135u, 151u, 155u, 157u, 65u, 90u, 97u, 122u,
12367			142u, 143u, 159u, 48u, 57u, 0
12368			};
12369
12370			static const char _generic_tokenizer_single_lengths[] = {
12371			0, 13, 10, 12, 1, 3, 1, 21,
12372			5, 4, 3, 4, 5, 1, 5, 1,
12373			5, 12, 4, 3, 1, 6, 1
12374			};
12375
12376			static const char _generic_tokenizer_range_lengths[] = {
12377			0, 2, 1, 1, 1, 1, 1, 7,
12378			0, 0, 0, 0, 0, 0, 0, 0,
12379			0, 1, 1, 1, 1, 3, 1
12380			};
12381
12382			static const unsigned char _generic_tokenizer_index_offsets[] = {
12383			0, 0, 16, 28, 42, 45, 50, 53,
12384			82, 88, 93, 97, 102, 108, 110, 116,
12385			118, 124, 138, 144, 149, 152, 162
12386			};
12387
12388			static const char _generic_tokenizer_indicies[] = {
12389			1, 1, 2, 2, 2, 2, 2, 3,
12390			2, 3, 1, 2, 2, 1, 3, 0,
12391			2, 2, 2, 2, 2, 3, 2, 3,
12392			2, 2, 3, 0, 4, 4, 5, 5,
12393			5, 5, 5, 5, 5, 5, 4, 5,
12394			4, 0, 6, 6, 0, 7, 7, 8,
12395			8, 0, 8, 8, 0, 10, 11, 12,
12396			10, 13, 9, 13, 9, 13, 16, 16,
12397			16, 16, 10, 16, 15, 13, 9, 17,
12398			9, 17, 9, 15, 9, 16, 9, 16,
12399			9, 14, 10, 19, 20, 10, 10, 18,
12400			10, 21, 10, 10, 18, 10, 10, 10,
12401			18, 10, 21, 10, 10, 18, 10, 22,
12402			23, 10, 10, 18, 25, 24, 10, 22,
12403			26, 10, 10, 18, 25, 24, 10, 23,
12404			26, 10, 10, 18, 4, 4, 5, 5,
12405			5, 5, 5, 5, 5, 5, 4, 5,
12406			4, 27, 28, 29, 29, 15, 15, 27,
12407			29, 29, 6, 6, 27, 8, 8, 27,
12408			16, 16, 16, 16, 16, 16, 16, 16,
12409			16, 27, 15, 15, 27, 0
12410			};
12411
12412			static const char _generic_tokenizer_trans_targs[] = {
12413			7, 1, 2, 7, 1, 3, 19, 6,
12414			20, 7, 8, 12, 16, 17, 0, 18,
12415			21, 22, 7, 9, 11, 10, 13, 14,
12416			7, 7, 15, 7, 4, 5
12417			};
12418
12419			static const char _generic_tokenizer_trans_actions[] = {
12420			1, 0, 0, 2, 3, 0, 4, 0,
12421			0, 7, 0, 0, 0, 4, 0, 4,
12422			0, 0, 8, 0, 0, 0, 0, 0,
12423			9, 10, 0, 11, 0, 0
12424			};
12425
12426			static const char _generic_tokenizer_to_state_actions[] = {
12427			0, 0, 0, 0, 0, 0, 0, 5,
12428			0, 0, 0, 0, 0, 0, 0, 0,
12429			0, 0, 0, 0, 0, 0, 0
12430			};
12431
12432			static const char _generic_tokenizer_from_state_actions[] = {
12433			0, 0, 0, 0, 0, 0, 0, 6,
12434			0, 0, 0, 0, 0, 0, 0, 0,
12435			0, 0, 0, 0, 0, 0, 0
12436			};
12437
12438			static const unsigned char _generic_tokenizer_eof_trans[] = {
12439			0, 1, 1, 1, 1, 1, 1, 0,
12440			19, 19, 19, 19, 19, 25, 19, 25,
12441			19, 28, 28, 28, 28, 28, 28
12442			};
12443
12444			static const int generic_tokenizer_start = 7;
12445
12446	0	0	generic_tokenizer::generic_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {}
		0
		0
		0
12447
12448	0		bool generic_tokenizer::next_sentence(vector& tokens) {
12449			using namespace unilib;
12450
12451			int cs, act;
12452			size_t ts, te;
12453			size_t whitespace = 0; // Suppress "may be uninitialized" warning
12454
12455	0	0	while (tokenize_url_email(tokens))
12456	0	0	if (emergency_sentence_split(tokens))
12457			return true;
12458
12459			{
12460			cs = generic_tokenizer_start;
12461	0		ts = 0;
12462			te = 0;
12463			act = 0;
12464			}
12465
12466			{
12467			int _klen;
12468			const short *_keys;
12469			int _trans;
12470			short _widec;
12471
12472	0	0	if ( ( current) == ( (chars.size() - 1)) )
12473			goto _test_eof;
12474			if ( cs == 0 )
12475			goto _out;
12476			_resume:
12477	0	0	switch ( _generic_tokenizer_from_state_actions[cs] ) {
12478			case 6:
12479	0		{ts = ( current);}
12480	0		break;
12481			}
12482
12483	0		_widec = ( ragel_char(chars[current]));
12484	0		_klen = _generic_tokenizer_cond_lengths[cs];
12485	0		_keys = _generic_tokenizer_cond_keys + (_generic_tokenizer_cond_offsets[cs]*2);
12486	0	0	if ( _klen > 0 ) {
12487			const short *_lower = _keys;
12488			const short *_mid;
12489	0		const short *_upper = _keys + (_klen<<1) - 2;
12490			while (1) {
12491	0	0	if ( _upper < _lower )
12492			break;
12493
12494	0		_mid = _lower + (((_upper-_lower) >> 1) & ~1);
12495	0	0	if ( _widec < _mid[0] )
12496	0		_upper = _mid - 2;
12497	0	0	else if ( _widec > _mid[1] )
12498	0		_lower = _mid + 2;
12499			else {
12500	0		switch ( _generic_tokenizer_cond_spaces[_generic_tokenizer_cond_offsets[cs] + ((_mid - _keys)>>1)] ) {
12501			case 0: {
12502	0		_widec = (short)(256u + (( ragel_char(chars[current])) - 0u));
12503	0	0	if (
12504	0	0	!current \|\| (chars[current-1].cat & ~(unicode::L \| unicode::M \| unicode::N \| unicode::Pd)) ) _widec += 256;
		0
12505			break;
12506			}
12507			case 1: {
12508	0		_widec = (short)(768u + (( ragel_char(chars[current])) - 0u));
12509	0	0	if (
12510	0	0	!current \|\| ((chars[current-1].cat & ~(unicode::L \| unicode::M \| unicode::N)) && chars[current-1].chr != '+') ) _widec += 256;
		0
		0
12511			break;
12512			}
12513			}
12514			break;
12515			}
12516			}
12517			}
12518
12519	0		_keys = _generic_tokenizer_trans_keys + _generic_tokenizer_key_offsets[cs];
12520	0		_trans = _generic_tokenizer_index_offsets[cs];
12521
12522	0		_klen = _generic_tokenizer_single_lengths[cs];
12523	0	0	if ( _klen > 0 ) {
12524			const short *_lower = _keys;
12525			const short *_mid;
12526	0		const short *_upper = _keys + _klen - 1;
12527			while (1) {
12528	0	0	if ( _upper < _lower )
12529			break;
12530
12531	0		_mid = _lower + ((_upper-_lower) >> 1);
12532	0	0	if ( _widec < *_mid )
12533	0		_upper = _mid - 1;
12534	0	0	else if ( _widec > *_mid )
12535	0		_lower = _mid + 1;
12536			else {
12537	0		_trans += (unsigned int)(_mid - _keys);
12538	0		goto _match;
12539			}
12540			}
12541	0		_keys += _klen;
12542	0		_trans += _klen;
12543			}
12544
12545	0		_klen = _generic_tokenizer_range_lengths[cs];
12546	0	0	if ( _klen > 0 ) {
12547			const short *_lower = _keys;
12548			const short *_mid;
12549	0		const short *_upper = _keys + (_klen<<1) - 2;
12550			while (1) {
12551	0	0	if ( _upper < _lower )
12552			break;
12553
12554	0		_mid = _lower + (((_upper-_lower) >> 1) & ~1);
12555	0	0	if ( _widec < _mid[0] )
12556	0		_upper = _mid - 2;
12557	0	0	else if ( _widec > _mid[1] )
12558	0		_lower = _mid + 2;
12559			else {
12560	0		_trans += (unsigned int)((_mid - _keys)>>1);
12561	0		goto _match;
12562			}
12563			}
12564	0		_trans += _klen;
12565			}
12566
12567			_match:
12568	0		_trans = _generic_tokenizer_indicies[_trans];
12569			_eof_trans:
12570	0		cs = _generic_tokenizer_trans_targs[_trans];
12571
12572	0	0	if ( _generic_tokenizer_trans_actions[_trans] == 0 )
12573			goto _again;
12574
12575	0		switch ( _generic_tokenizer_trans_actions[_trans] ) {
12576			case 3:
12577	0		{ whitespace = current; }
12578	0		break;
12579			case 4:
12580	0		{te = ( current)+1;}
12581	0		break;
12582			case 7:
12583	0		{te = ( current)+1;{ tokens.emplace_back(ts, te - ts);
12584	0		current = te;
12585	0	0	do
12586	0	0	if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12587			while (tokenize_url_email(tokens));
12588	0		( current)--;
12589			}}
12590	0		break;
12591			case 2:
12592	0		{te = ( current)+1;{
12593	0		bool eos = is_eos(tokens, chars[ts].chr, nullptr);
12594	0	0	for (current = ts; current < whitespace; current++)
12595	0		tokens.emplace_back(current, 1);
12596	0		{( current) = (( whitespace))-1;}
12597	0	0	if (eos) {( current)++; goto _out; }
12598			}}
12599			break;
12600			case 10:
12601	0		{te = ( current)+1;{
12602	0	0	if (!tokens.empty()) {( current)++; goto _out; }
12603	0		current = te;
12604	0	0	do
12605	0	0	if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12606			while (tokenize_url_email(tokens));
12607	0		( current)--;
12608			}}
12609	0		break;
12610			case 11:
12611	0		{te = ( current);( current)--;{ tokens.emplace_back(ts, te - ts);
12612	0		current = te;
12613	0	0	do
12614	0	0	if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12615			while (tokenize_url_email(tokens));
12616	0		( current)--;
12617			}}
12618	0		break;
12619			case 8:
12620	0		{te = ( current);( current)--;{
12621	0		current = te;
12622	0	0	do
12623	0	0	if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12624			while (tokenize_url_email(tokens));
12625	0		( current)--;
12626			}}
12627	0		break;
12628			case 9:
12629	0		{te = ( current);( current)--;{
12630	0	0	if (!tokens.empty()) {( current)++; goto _out; }
12631	0		current = te;
12632	0	0	do
12633	0	0	if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12634			while (tokenize_url_email(tokens));
12635	0		( current)--;
12636			}}
12637	0		break;
12638			case 1:
12639	0		{{( current) = ((te))-1;}{ tokens.emplace_back(ts, te - ts);
12640	0		current = te;
12641	0	0	do
12642	0	0	if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
12643			while (tokenize_url_email(tokens));
12644	0		( current)--;
12645			}}
12646	0		break;
12647			}
12648
12649			_again:
12650	0	0	switch ( _generic_tokenizer_to_state_actions[cs] ) {
12651			case 5:
12652	0		{ts = 0;}
12653	0		break;
12654			}
12655
12656	0	0	if ( cs == 0 )
12657			goto _out;
12658	0	0	if ( ++( current) != ( (chars.size() - 1)) )
12659			goto _resume;
12660			_test_eof: {}
12661	0	0	if ( ( current) == ( (chars.size() - 1)) )
12662			{
12663	0	0	if ( _generic_tokenizer_eof_trans[cs] > 0 ) {
12664	0		_trans = _generic_tokenizer_eof_trans[cs] - 1;
12665	0		goto _eof_trans;
12666			}
12667			}
12668
12669			_out: {}
12670			}
12671
12672			(void)act; // Suppress unused variable warning
12673
12674	0		return !tokens.empty();
12675			}
12676
12677			} // namespace morphodita
12678
12679			/////////
12680			// File: morphodita/tokenizer/generic_tokenizer_factory.h
12681			/////////
12682
12683			// This file is part of MorphoDiTa .
12684			//
12685			// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
12686			// Mathematics and Physics, Charles University in Prague, Czech Republic.
12687			//
12688			// This Source Code Form is subject to the terms of the Mozilla Public
12689			// License, v. 2.0. If a copy of the MPL was not distributed with this
12690			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
12691
12692			namespace morphodita {
12693
12694	0		class generic_tokenizer_factory : public tokenizer_factory {
12695			public:
12696			// Construct a new tokenizer instance.
12697			virtual tokenizer* new_tokenizer(const morpho* m) const override;
12698
12699			bool load(istream& is);
12700			private:
12701			unsigned version;
12702			};
12703
12704			} // namespace morphodita
12705
12706			/////////
12707			// File: morphodita/tokenizer/generic_tokenizer_factory.cpp
12708			/////////
12709
12710			// This file is part of MorphoDiTa .
12711			//
12712			// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
12713			// Mathematics and Physics, Charles University in Prague, Czech Republic.
12714			//
12715			// This Source Code Form is subject to the terms of the Mozilla Public
12716			// License, v. 2.0. If a copy of the MPL was not distributed with this
12717			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
12718
12719			namespace morphodita {
12720
12721	0		tokenizer* generic_tokenizer_factory::new_tokenizer(const morpho* /m/) const {
12722	0		return new generic_tokenizer(version);
12723			}
12724
12725	0		bool generic_tokenizer_factory::load(istream& is) {
12726	0	0	version = is.get();
12727
12728	0		return bool(is);
12729			}
12730
12731			} // namespace morphodita
12732
12733			/////////
12734			// File: morphodita/tokenizer/generic_tokenizer_factory_encoder.h
12735			/////////
12736
12737			// This file is part of MorphoDiTa .
12738			//
12739			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
12740			// Mathematics and Physics, Charles University in Prague, Czech Republic.
12741			//
12742			// This Source Code Form is subject to the terms of the Mozilla Public
12743			// License, v. 2.0. If a copy of the MPL was not distributed with this
12744			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
12745
12746			namespace morphodita {
12747
12748			class generic_tokenizer_factory_encoder {
12749			public:
12750			static void encode(unsigned version, ostream& os);
12751			};
12752
12753			} // namespace morphodita
12754
12755			/////////
12756			// File: morphodita/tokenizer/generic_tokenizer_factory_encoder.cpp
12757			/////////
12758
12759			// This file is part of MorphoDiTa .
12760			//
12761			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
12762			// Mathematics and Physics, Charles University in Prague, Czech Republic.
12763			//
12764			// This Source Code Form is subject to the terms of the Mozilla Public
12765			// License, v. 2.0. If a copy of the MPL was not distributed with this
12766			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
12767
12768			namespace morphodita {
12769
12770	0		void generic_tokenizer_factory_encoder::encode(unsigned version, ostream& os) {
12771	0	0	os.put(version);
12772	0		}
12773
12774			} // namespace morphodita
12775
12776			/////////
12777			// File: unilib/uninorms.h
12778			/////////
12779
12780			// This file is part of UniLib .
12781			//
12782			// Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of
12783			// Mathematics and Physics, Charles University in Prague, Czech Republic.
12784			//
12785			// This Source Code Form is subject to the terms of the Mozilla Public
12786			// License, v. 2.0. If a copy of the MPL was not distributed with this
12787			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
12788			//
12789			// UniLib version: 3.3.0
12790			// Unicode version: 15.0.0
12791
12792			namespace unilib {
12793
12794			class uninorms {
12795			public:
12796			static void nfc(std::u32string& str);
12797			static void nfd(std::u32string& str);
12798			static void nfkc(std::u32string& str);
12799			static void nfkd(std::u32string& str);
12800
12801			private:
12802			static void compose(std::u32string& str);
12803			static void decompose(std::u32string& str, bool kanonical);
12804
12805			static const char32_t CHARS = 0x110000;
12806
12807			struct Hangul {
12808			// Hangul decomposition and composition
12809			static const char32_t SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
12810			static const char32_t LCount = 19, VCount = 21, TCount = 28, NCount = VCount * TCount, SCount = LCount * NCount;
12811			};
12812
12813			static const uint8_t ccc_index[CHARS >> 8];
12814			static const uint8_t ccc_block[][256];
12815
12816			static const uint8_t composition_index[CHARS >> 8];
12817			static const uint16_t composition_block[][257];
12818			static const char32_t composition_data[];
12819
12820			static const uint8_t decomposition_index[CHARS >> 8];
12821			static const uint16_t decomposition_block[][257];
12822			static const char32_t decomposition_data[];
12823			};
12824
12825			} // namespace unilib
12826
12827			/////////
12828			// File: morphodita/tokenizer/gru_tokenizer_network.h
12829			/////////
12830
12831			// This file is part of MorphoDiTa .
12832			//
12833			// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
12834			// Mathematics and Physics, Charles University in Prague, Czech Republic.
12835			//
12836			// This Source Code Form is subject to the terms of the Mozilla Public
12837			// License, v. 2.0. If a copy of the MPL was not distributed with this
12838			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
12839
12840			namespace morphodita {
12841
12842			// Declarations
12843
12844	1		class gru_tokenizer_network {
12845			public:
12846	1		virtual ~gru_tokenizer_network() {}
12847
12848			template struct matrix {
12849			float w[R][C];
12850			float b[R];
12851
12852			void clear();
12853			void load(binary_decoder& data);
12854			};
12855
12856			enum { NO_SPLIT, END_OF_TOKEN, END_OF_SENTENCE, OUTCOMES };
12857			struct outcome_t {
12858			int outcome;
12859			float w[3];
12860			const float* embedding;
12861			};
12862			struct char_info {
12863			char32_t chr;
12864			unilib::unicode::category_t cat;
12865
12866			char_info() {}
12867	34		char_info(char32_t chr, unilib::unicode::category_t cat) : chr(chr), cat(cat) {}
12868			};
12869
12870			virtual void classify(const vector& chars, vector& outcomes) const = 0;
12871
12872			static gru_tokenizer_network* load(binary_decoder& data);
12873			};
12874
12875			template
12876	2		class gru_tokenizer_network_implementation : public gru_tokenizer_network {
12877			public:
12878			virtual void classify(const vector& chars, vector& outcomes) const override;
12879
12880			static gru_tokenizer_network_implementation* load(binary_decoder& data);
12881
12882			protected:
12883			void cache_embeddings();
12884
12885			struct cached_embedding {
12886			matrix<1, D> e;
12887			matrix<6, D> cache;
12888			};
12889
12890			struct gru {
12891			matrix X, X_r, X_z;
12892			matrix H, H_r, H_z;
12893
12894			void load(binary_decoder& data);
12895			};
12896
12897			unordered_map embeddings;
12898			cached_embedding empty_embedding;
12899			gru gru_fwd, gru_bwd;
12900			matrix<3, D> projection_fwd, projection_bwd;
12901			unordered_map unknown_chars;
12902			};
12903
12904			// Definitions
12905
12906			template
12907			void gru_tokenizer_network::matrix::clear() {
12908	4	100	for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f);
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
12909	0		fill_n(b, R, 0.f);
12910			}
12911
12912			template
12913	28		void gru_tokenizer_network::matrix::load(binary_decoder& data) {
12914	212	0	for (int i = 0; i < R; i++) memcpy(w[i], data.next(C), sizeof(float) * C);
		0
		100
		0
		0
		100
12915	14		memcpy(b, data.next(R), sizeof(float) * R);
12916	14		}
12917
12918			template
12919	2		void gru_tokenizer_network_implementation::gru::load(binary_decoder& data) {
12920	2		X.load(data);
12921	2		X_r.load(data);
12922	2		X_z.load(data);
12923	2		H.load(data);
12924	2		H_r.load(data);
12925	2		H_z.load(data);
12926	2		}
12927
12928			template
12929	1		void gru_tokenizer_network_implementation::classify(const vector& chars, vector& outcomes) const {
12930	2	50	if (chars.empty()) return;
		0
		0
12931
12932			// Resolve embeddings, possibly with unknown_chars or empty_embedding
12933			u32string decomposition;
12934	35	100	for (size_t i = 0; i < chars.size(); i++) {
		0
		0
12935	34		auto embedding = embeddings.find(chars[i].chr);
12936
12937			// Try finding substitute character if not found, by using NFKD
12938			// and by replacing IDEOGRAPHIC FULL STOP/COMMA.
12939	34		if (embedding == embeddings.end()) {
12940	0		decomposition.assign(1, chars[i].chr);
12941			unilib::uninorms::nfkd(decomposition);
12942	0	0	if (decomposition[0] == 0x3001) decomposition[0] = char32_t(',');
		0
		0
12943	0	0	if (decomposition[0] == 0x3002) decomposition[0] = char32_t('.');
		0
		0
12944	0	0	if (decomposition[0] != chars[i].chr) embedding = embeddings.find(decomposition[0]);
		0
		0
12945			}
12946
12947	34	50	if (embedding != embeddings.end()) {
		0
		0
12948	34		outcomes[i].embedding = embedding->second.cache.w[0];
12949			} else {
12950	0		auto unknown_char = unknown_chars.find(chars[i].cat);
12951	0		if (unknown_char != unknown_chars.end()) embedding = embeddings.find(unknown_char->second);
12952	0	0	outcomes[i].embedding = embedding != embeddings.end() ? embedding->second.cache.w[0] : empty_embedding.cache.w[0];
		0
		0
12953			}
12954			}
12955
12956			// Clear outcome probabilities
12957	35	100	for (auto&& outcome : outcomes)
		0
		0
12958	136	100	for (int i = 0; i < 3; i++)
		0
		0
12959	102		outcome.w[i] = projection_fwd.b[i];
12960
12961			// Perform forward & backward GRU
12962			matrix<1, D> state, update, reset, candidate;
12963	3	100	for (int dir = 0; dir < 2; dir++) {
		0
		0
12964	2	100	auto& gru = dir == 0 ? gru_fwd : gru_bwd;
		0
		0
12965	2	100	auto& projection = dir == 0 ? projection_fwd : projection_bwd;
		0
		0
12966
12967			state.clear();
12968	70	100	for (size_t i = 0; i < outcomes.size(); i++) {
		0
		0
12969	68	100	auto& outcome = outcomes[dir == 0 ? i : outcomes.size() - 1 - i];
		0
		0
12970	68	100	auto* embedding_cache = outcome.embedding + (dir == 1) * 3 * D;
		0
		0
12971
12972	1156	100	for (int j = 0; j < D; j++) {
		0
		0
12973	1088		update.w[0][j] = gru.X_z.b[j] + embedding_cache[2*D + j];
12974	1088		reset.w[0][j] = gru.X_r.b[j] + embedding_cache[D + j];
12975	18496	100	for (int k = 0; k < D; k++) {
		0
		0
12976	17408		update.w[0][j] += state.w[0][k] * gru.H_z.w[j][k];
12977	17408		reset.w[0][j] += state.w[0][k] * gru.H_r.w[j][k];
12978			}
12979	2176		update.w[0][j] = 1.f / (1.f + exp(-update.w[0][j]));
12980	2176		reset.w[0][j] = 1.f / (1.f + exp(-reset.w[0][j]));
12981	1088		reset.w[0][j] *= state.w[0][j];
12982			}
12983	1156	100	for (int j = 0; j < D; j++) {
		0
		0
12984	1088		candidate.w[0][j] = gru.X.b[j] + embedding_cache[j];
12985	18496	100	for (int k = 0; k < D; k++)
		0
		0
12986	17408		candidate.w[0][j] += reset.w[0][k] * gru.H.w[j][k];
12987	1088		candidate.w[0][j] = tanh(candidate.w[0][j]);
12988	1088		state.w[0][j] = update.w[0][j] * state.w[0][j] + (1.f - update.w[0][j]) * candidate.w[0][j];
12989			}
12990
12991	272	100	for (int j = 0; j < 3; j++)
		0
		0
12992	3468	100	for (int k = 0; k < D; k++)
		0
		0
12993	3264		outcome.w[j] += projection.w[j][k] * state.w[0][k];
12994			}
12995			}
12996
12997			// Choose the outcome with the highest weight
12998	35	100	for (auto&& outcome : outcomes) {
		0
		0
12999	34		outcome.outcome = outcome.w[1] > outcome.w[0];
13000	34	100	if (outcome.w[2] > outcome.w[outcome.outcome]) outcome.outcome = 2;
		0
		0
13001			}
13002			}
13003
13004			template
13005	1		gru_tokenizer_network_implementation* gru_tokenizer_network_implementation::load(binary_decoder& data) {
13006	1		unique_ptr> network(new gru_tokenizer_network_implementation());
13007
13008	21	0	for (unsigned chars = data.next_4B(); chars; chars--) {
		0
		0
		0
		50
		100
13009	20	0	auto& embedding = network->embeddings[data.next_4B()];
		0
		50
13010	20	0	copy_n(data.next(D), D, embedding.e.w[0]);
		0
		50
13011			}
13012	1		fill_n(network->empty_embedding.e.w[0], D, 0.f);
13013
13014	1	0	network->gru_fwd.load(data);
		0
		50
13015	1	0	network->gru_bwd.load(data);
		0
		50
13016	1	0	network->projection_fwd.load(data);
		0
		50
13017	1	0	network->projection_bwd.load(data);
		0
		50
13018
13019			network->unknown_chars.clear();
13020	5	0	for (unsigned unknown_chars_len = data.next_1B(); unknown_chars_len; unknown_chars_len--) {
		0
		0
		0
		50
		100
13021	4	0	unilib::unicode::category_t cat = data.next_4B();
		0
		50
13022	4	0	network->unknown_chars[cat] = data.next_4B();
		0
		50
13023			}
13024
13025	1		network->cache_embeddings();
13026
13027	1		return network.release();
13028			}
13029
13030			template
13031	2		void gru_tokenizer_network_implementation::cache_embeddings() {
13032	21	0	for (auto&& embedding : embeddings) {
		0
		100
13033			auto& e = embedding.second.e;
13034			auto& cache = embedding.second.cache;
13035
13036	140	0	for (int i = 0; i < 6; i++) fill_n(cache.w[i], D, 0.f);
		0
		100
13037	5460	0	for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[0][i] += e.w[0][j] * gru_fwd.X.w[i][j];
		0
		0
		0
		100
		100
13038	5460	0	for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[1][i] += e.w[0][j] * gru_fwd.X_r.w[i][j];
		0
		0
		0
		100
		100
13039	5460	0	for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[2][i] += e.w[0][j] * gru_fwd.X_z.w[i][j];
		0
		0
		0
		100
		100
13040	5460	0	for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[3][i] += e.w[0][j] * gru_bwd.X.w[i][j];
		0
		0
		0
		100
		100
13041	5460	0	for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[4][i] += e.w[0][j] * gru_bwd.X_r.w[i][j];
		0
		0
		0
		100
		100
13042	5460	0	for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[5][i] += e.w[0][j] * gru_bwd.X_z.w[i][j];
		0
		0
		0
		100
		100
13043			}
13044	7	0	for (int i = 0; i < 6; i++) fill_n(empty_embedding.cache.w[i], D, 0.f);
		0
		100
13045	1		}
13046
13047			} // namespace morphodita
13048
13049			/////////
13050			// File: morphodita/tokenizer/gru_tokenizer.h
13051			/////////
13052
13053			// This file is part of MorphoDiTa .
13054			//
13055			// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
13056			// Mathematics and Physics, Charles University in Prague, Czech Republic.
13057			//
13058			// This Source Code Form is subject to the terms of the Mozilla Public
13059			// License, v. 2.0. If a copy of the MPL was not distributed with this
13060			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
13061
13062			namespace morphodita {
13063
13064	4		class gru_tokenizer : public unicode_tokenizer {
13065			public:
13066			gru_tokenizer(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, const gru_tokenizer_network& network)
13067	1	0	: unicode_tokenizer(url_email_tokenizer), segment(segment), allow_spaces(allow_spaces), network_index(0), network_length(0), network(network) {}
		0
		0
		50
13068
13069			virtual bool next_sentence(vector& tokens) override;
13070
13071			private:
13072			inline bool is_space(size_t index);
13073			int next_outcome();
13074
13075			unsigned segment;
13076			bool allow_spaces;
13077			unsigned network_index, network_length;
13078			vector network_chars;
13079			vector network_outcomes;
13080			vector network_offsets;
13081			const gru_tokenizer_network& network;
13082			};
13083
13084			} // namespace morphodita
13085
13086			/////////
13087			// File: morphodita/tokenizer/gru_tokenizer.cpp
13088			/////////
13089
13090			// This file is part of MorphoDiTa .
13091			//
13092			// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
13093			// Mathematics and Physics, Charles University in Prague, Czech Republic.
13094			//
13095			// This Source Code Form is subject to the terms of the Mozilla Public
13096			// License, v. 2.0. If a copy of the MPL was not distributed with this
13097			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
13098
13099			namespace morphodita {
13100
13101			bool gru_tokenizer::is_space(size_t index) {
13102	83	100	return (chars[index].cat & unilib::unicode::Zs) \|\| chars[index].chr == '\r' \|\| chars[index].chr == '\n' \|\| chars[index].chr == '\t';
		50
		50
		50
		50
		50
		50
		50
		100
		50
		50
		50
		100
		50
		50
		50
13103			}
13104
13105	2		bool gru_tokenizer::next_sentence(vector& tokens) {
13106			tokens.clear();
13107
13108			// Reset tokenizer on new text
13109	9	100	if (current == 0) network_index = network_length = 0;
13110
13111			// Tokenize until EOS
13112	9	100	for (bool eos = false; !eos && !emergency_sentence_split(tokens); ) {
		50
		100
13113	25	100	while (current < chars.size() - 1 && is_space(current))
		100
		100
13114	5	50	if (next_outcome() == gru_tokenizer_network::END_OF_SENTENCE && !tokens.empty())
		0
		50
13115			break;
13116
13117	8	100	if (current >= chars.size() - 1) break;
13118
13119			// We have a beginning of a token. Try if it is an URL.
13120	7	50	if (tokenize_url_email(tokens)) {
13121	0	0	while (network_index < network_length && network_offsets[network_index] < current)
		0
		0
13122	0	0	if (network_outcomes[network_index++].outcome == gru_tokenizer_network::END_OF_SENTENCE && !tokens.empty())
		0
		0
13123			eos = true;
13124	0		continue;
13125			}
13126
13127			// Slurp current token
13128	7		size_t token_start = current;
13129	22	50	do {
13130	29		int outcome = next_outcome();
13131	29		eos = outcome == gru_tokenizer_network::END_OF_SENTENCE;
13132	29	100	if (outcome != gru_tokenizer_network::NO_SPLIT) break;
13133	44		} while (current < chars.size() - 1);
13134	8		tokens.emplace_back(token_start, current - token_start);
13135			}
13136
13137	2		return !tokens.empty();
13138			}
13139
13140	34		int gru_tokenizer::next_outcome() {
13141	34	100	if (network_index >= network_length) {
13142			// Compute required window
13143	1		network_index = 0;
13144	1		network_length = 0;
13145			network_chars.clear();
13146			network_outcomes.clear();
13147			network_offsets.clear();
13148
13149			// Prepare data for the classification
13150	70	100	for (size_t offset = current;
13151	35	100	network_offsets.push_back(offset), offset < chars.size() - 1 && network_length < segment;
		50
13152	34		network_length++, offset++) {
13153	34	100	if (is_space(offset)) {
13154	5		network_chars.emplace_back(' ', unilib::unicode::Zs);
13155	9	100	while (offset + 1 < chars.size() - 1 && is_space(offset + 1)) offset++;
		50
		50
13156			} else {
13157	29		network_chars.emplace_back(chars[offset].chr, chars[offset].cat);
13158			}
13159			}
13160			// Add a space to the end on the EOD
13161	1	50	if (network_length < segment && network_chars.back().chr != ' ')
		50
		50
13162	0		network_chars.emplace_back(' ', unilib::unicode::Zs);
13163	1		network_outcomes.resize(network_chars.size());
13164
13165			// Perform the classification
13166	34		network.classify(network_chars, network_outcomes);
13167
13168			// Add spacing token/sentence breaks
13169	34	100	for (size_t i = 0; i < network_length - 1; i++)
13170	33	100	if (is_space(network_offsets[i+1])) {
13171			// Detect EOS on the following space or \n\n or \r\n\r\n, or if there is end of text
13172	5		bool eos = network_outcomes[i+1].outcome == gru_tokenizer_network::END_OF_SENTENCE;
13173	5	100	if (i + 2 == network_length) eos = true;
13174	5	50	for (size_t j = network_offsets[i+1]; j + 1 < network_offsets[i+2] && !eos; j++)
		0
		50
13175	0	0	eos = (chars[j].chr == '\n' && chars[j+1].chr == '\n') \|\|
		0
		0
13176	0	0	(j + 3 < network_offsets[i+2] && chars[j].chr == '\r' && chars[j+1].chr == '\n' && chars[j+2].chr == '\r' && chars[j+3].chr == '\n');
		0
		0
		0
13177	5	100	if (eos) network_outcomes[i].outcome = gru_tokenizer_network::END_OF_SENTENCE;
13178
13179	5	100	if (network_outcomes[i].outcome == gru_tokenizer_network::NO_SPLIT)
13180			// Force EOT if not allowing spaces, and also detect EOT on the following space
13181	4	50	if (!allow_spaces \|\| network_outcomes[i+1].outcome == gru_tokenizer_network::END_OF_TOKEN)
		0
		50
13182	4		network_outcomes[i].outcome = gru_tokenizer_network::END_OF_TOKEN;
13183			}
13184
13185			// Adjust network_length to suitable break
13186	1	50	if (network_length == segment && network_length >= 10) {
		0
13187	0		network_length -= 5;
13188	0	0	while (network_length > segment / 2)
13189	0	0	if (network_outcomes[--network_length].outcome != gru_tokenizer_network::NO_SPLIT)
13190			break;
13191			}
13192			}
13193	102		return current = network_offsets[network_index + 1], network_outcomes[network_index++].outcome;
13194			}
13195
13196			} // namespace morphodita
13197
13198			/////////
13199			// File: morphodita/tokenizer/gru_tokenizer_factory.h
13200			/////////
13201
13202			// This file is part of MorphoDiTa .
13203			//
13204			// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
13205			// Mathematics and Physics, Charles University in Prague, Czech Republic.
13206			//
13207			// This Source Code Form is subject to the terms of the Mozilla Public
13208			// License, v. 2.0. If a copy of the MPL was not distributed with this
13209			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
13210
13211			namespace morphodita {
13212
13213	2		class gru_tokenizer_factory : public tokenizer_factory {
13214			public:
13215			// Construct a new tokenizer instance.
13216			virtual tokenizer* new_tokenizer(const morpho* m) const override;
13217
13218			bool load(istream& is);
13219
13220			private:
13221			unsigned url_email_tokenizer;
13222			unsigned segment;
13223			bool allow_spaces;
13224
13225			unique_ptr<gru_tokenizer_network> network;
13226			};
13227
13228			} // namespace morphodita
13229
13230			/////////
13231			// File: morphodita/tokenizer/gru_tokenizer_factory.cpp
13232			/////////
13233
13234			// This file is part of MorphoDiTa .
13235			//
13236			// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
13237			// Mathematics and Physics, Charles University in Prague, Czech Republic.
13238			//
13239			// This Source Code Form is subject to the terms of the Mozilla Public
13240			// License, v. 2.0. If a copy of the MPL was not distributed with this
13241			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
13242
13243			namespace morphodita {
13244
13245	1		tokenizer* gru_tokenizer_factory::new_tokenizer(const morpho* /*m*/) const {
13246	2		return new gru_tokenizer(url_email_tokenizer, segment, allow_spaces, *network);
13247			}
13248
13249	1		bool gru_tokenizer_factory::load(istream& is) {
13250			char version;
13251	1	50	if (!is.get(version)) return false;
13252	1	50	if (!(version >= 1 && version <= 2)) return false;
13253
13254			binary_decoder data;
13255	1	50	if (!compressor::load(is, data)) return false;
		50
13256
13257			try {
13258	1	50	url_email_tokenizer = data.next_1B();
13259	1	50	segment = data.next_2B();
13260	1	50	allow_spaces = version >= 2 ? data.next_1B() : false /*false was default for version 1*/;
		0
		0
13261
13262	1	50	network.reset(gru_tokenizer_network::load(data));
13263	1	50	if (!network) return false;
		0
13264			} catch (binary_decoder_error&) {
13265			return false;
13266			}
13267
13268	1		return data.is_end();
13269			}
13270
13271			} // namespace morphodita
13272
13273			/////////
13274			// File: morphodita/tokenizer/gru_tokenizer_network.cpp
13275			/////////
13276
13277			// This file is part of MorphoDiTa .
13278			//
13279			// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
13280			// Mathematics and Physics, Charles University in Prague, Czech Republic.
13281			//
13282			// This Source Code Form is subject to the terms of the Mozilla Public
13283			// License, v. 2.0. If a copy of the MPL was not distributed with this
13284			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
13285
13286			namespace morphodita {
13287
13288	1		gru_tokenizer_network* gru_tokenizer_network::load(binary_decoder& data) {
13289	1	50	if (data.next_1B() != 1) return nullptr;
13290	1		switch (data.next_1B()) {
13291	1		case 16: return gru_tokenizer_network_implementation<16>::load(data);
13292	0		case 24: return gru_tokenizer_network_implementation<24>::load(data);
13293	0		case 64: return gru_tokenizer_network_implementation<64>::load(data);
13294			}
13295			return nullptr;
13296			}
13297
13298			} // namespace morphodita
13299
13300			/////////
13301			// File: morphodita/tokenizer/gru_tokenizer_trainer.h
13302			/////////
13303
13304			// This file is part of MorphoDiTa .
13305			//
13306			// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
13307			// Mathematics and Physics, Charles University in Prague, Czech Republic.
13308			//
13309			// This Source Code Form is subject to the terms of the Mozilla Public
13310			// License, v. 2.0. If a copy of the MPL was not distributed with this
13311			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
13312
13313			namespace morphodita {
13314
13315	0		struct tokenized_sentence {
13316			u32string sentence;
13317			vector tokens;
13318			};
13319
13320			class gru_tokenizer_trainer {
13321			public:
13322			enum { URL_EMAIL_LATEST = unicode_tokenizer::URL_EMAIL_LATEST };
13323
13324			static bool train(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, unsigned dimension, unsigned epochs,
13325			unsigned batch_size, float learning_rate, float learning_rate_final, float dropout,
13326			float initialization_range, bool early_stopping, const vector<tokenized_sentence>& data,
13327			const vector<tokenized_sentence>& heldout, ostream& os, string& error);
13328			};
13329
13330			} // namespace morphodita
13331
13332			/////////
13333			// File: morphodita/tokenizer/gru_tokenizer_network_trainer.h
13334			/////////
13335
13336			// This file is part of MorphoDiTa .
13337			//
13338			// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
13339			// Mathematics and Physics, Charles University in Prague, Czech Republic.
13340			//
13341			// This Source Code Form is subject to the terms of the Mozilla Public
13342			// License, v. 2.0. If a copy of the MPL was not distributed with this
13343			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
13344
13345			namespace morphodita {
13346
13347			//
13348			// Declarations
13349			//
13350
13351			template <int D>
13352	0	0	class gru_tokenizer_network_trainer : public gru_tokenizer_network_implementation<D> {
		0
		0
13353			public:
13354			bool train(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, unsigned epochs, unsigned batch_size,
13355			float learning_rate, float learning_rate_final, float dropout, float initialization_range,
13356			bool early_stopping, const vector<tokenized_sentence>& data, const vector<tokenized_sentence>& heldout,
13357			binary_encoder& enc, string& error);
13358
13359			private:
13360			template <int R, int C> using matrix = typename gru_tokenizer_network_implementation<D>::template matrix<R, C>;
13361			using typename gru_tokenizer_network_implementation<D>::cached_embedding;
13362			using typename gru_tokenizer_network_implementation<D>::gru;
13363
13364			template <int R, int C> struct matrix_trainer {
13365			matrix<R, C>& original;
13366			float w_g[R][C], b_g[R];
13367			float w_m[R][C], b_m[R];
13368			float w_v[R][C], b_v[R];
13369
13370	0	0	matrix_trainer(matrix<R, C>& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {}
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
13371			void update_weights(float learning_rate);
13372			};
13373	0		struct gru_trainer {
13374			matrix_trainer<D, D> X, X_r, X_z;
13375			matrix_trainer<D, D> H, H_r, H_z;
13376			vector<matrix<1, D>> states, updates, resets, resetstates, candidates, dropouts;
13377
13378	0		gru_trainer(gru& g, unsigned segment)
13379			: X(g.X), X_r(g.X_r), X_z(g.X_z), H(g.H), H_r(g.H_r), H_z(g.H_z), states(segment + 1),
13380	0	0	updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {}
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
13381			void update_weights(float learning_rate);
13382			};
13383
13384			struct f1_info { double precision, recall, f1; };
13385			void evaluate(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, const vector& heldout,
13386			f1_info& tokens_f1, f1_info& sentences_f1);
13387			void evaluate_f1(const vector& system, const vector& gold, f1_info& f1);
13388
13389			template void random_matrix(matrix& m, mt19937& generator, float range, float bias);
13390			void random_gru(gru& g, mt19937& generator, float range);
13391
13392			template void save_matrix(const matrix& m, binary_encoder& enc);
13393			void save_gru(const gru& g, binary_encoder& enc);
13394			};
13395
13396			//
13397			// Definitions
13398			//
13399
13400			template <int D>
13401	0		bool gru_tokenizer_network_trainer<D>::train(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, unsigned epochs, unsigned batch_size,
13402			float learning_rate_initial, float learning_rate_final, float dropout,
13403			float initialization_range, bool early_stopping, const vector<tokenized_sentence>& data,
13404			const vector<tokenized_sentence>& heldout, binary_encoder& enc, string& error) {
13405	0	0	if (segment < 10) return error.assign("Segment size must be at least 10!"), false;
		0
		0
13406
13407			unsigned characters = 0;
13408	0	0	for (auto&& sentence : data)
		0
		0
13409	0		characters += sentence.sentence.size();
13410	0	0	if (characters < segment) return error.assign("Not enought training data for the gru_tokenizer!"), false;
		0
		0
13411
13412			mt19937 generator;
13413
13414	0		float dropout_multiplier = 1.f / (1.f - dropout);
13415	0		bernoulli_distribution dropout_distribution(dropout);
13416
13417			// Generate embeddings
13418	0	0	for (auto&& sentence : data)
		0
		0
13419	0	0	for (auto&& chr : sentence.sentence)
		0
		0
13420	0		if (!this->embeddings.count(chr)) {
13421			cached_embedding embedding;
13422	0		random_matrix(embedding.e, generator, initialization_range, 0.f);
13423			this->embeddings.emplace(chr, embedding);
13424			}
13425			this->empty_embedding.e.clear();
13426
13427			// Initialize weights
13428	0		random_gru(this->gru_fwd, generator, initialization_range);
13429	0		random_gru(this->gru_bwd, generator, initialization_range);
13430	0		random_matrix(this->projection_fwd, generator, initialization_range, 0.f); this->projection_fwd.b[this->NO_SPLIT] = 1.f;
13431	0		random_matrix(this->projection_bwd, generator, initialization_range, 0.f); this->projection_bwd.b[this->NO_SPLIT] = 1.f;
13432
13433			// Train the network
13434			unordered_map> embeddings;
13435	0	0	for (auto&& embedding : this->embeddings)
		0
		0
13436	0		embeddings.emplace(embedding.first, embedding.second.e);
13437	0	0	vector*> chosen_embeddings(segment);
		0
		0
13438	0	0	vector> embedding_dropouts(segment);
		0
		0
13439	0	0	gru_trainer gru_fwd(this->gru_fwd, segment), gru_bwd(this->gru_bwd, segment);
		0
		0
		0
		0
		0
13440	0		matrix_trainer<3, D> projection_fwd(this->projection_fwd), projection_bwd(this->projection_bwd);
13441			float learning_rate = learning_rate_initial, b1t = 1.f, b2t = 1.f;
13442
13443			float best_combined_f1 = 0.f; unsigned best_combined_f1_epoch = 0;
13444			gru_tokenizer_network_trainer best_combined_f1_network;
13445
13446			size_t training_offset = 0, training_shift;
13447	0	0	vector training_input, instance_input(segment);
		0
		0
13448	0	0	vector training_output, instance_output(segment);
		0
		0
13449	0	0	vector permutation; for (size_t i = 0; i < data.size(); i++) permutation.push_back(permutation.size());
		0
		0
13450	0	0	for (unsigned epoch = 0; epoch < epochs; epoch++) {
		0
		0
13451			double logprob = 0;
13452			int total = 0, correct = 0;
13453
13454	0	0	for (int instance = 0, instances = 10000; instance < instances; instance++) {
		0
		0
13455			// Prepare input instance
13456	0	0	if (training_offset + segment >= training_input.size()) {
		0
		0
13457	0		shuffle(permutation.begin(), permutation.end(), generator);
13458			training_input.clear(); training_output.clear();
13459	0	0	for (auto&& index : permutation) {
		0
		0
13460	0		auto& sentence = data[index];
13461	0	0	if (sentence.tokens.empty()) continue;
		0
		0
13462
13463			training_offset = training_input.size();
13464	0	0	training_input.resize(training_offset + sentence.sentence.size());
		0
		0
13465	0	0	training_output.resize(training_offset + sentence.sentence.size());
		0
		0
13466	0	0	for (size_t i = 0; i < sentence.sentence.size(); i++) {
		0
		0
13467	0		training_input[training_offset + i].chr = sentence.sentence[i];
13468	0		training_output[training_offset + i].outcome = gru_tokenizer_network::NO_SPLIT;
13469			}
13470	0	0	for (size_t i = 0; i < sentence.tokens.size(); i++)
		0
		0
13471	0	0	training_output[training_offset + sentence.tokens[i].start + sentence.tokens[i].length - 1].outcome =
		0
		0
13472			i+1 < sentence.tokens.size() ? gru_tokenizer_network::END_OF_TOKEN : gru_tokenizer_network::END_OF_SENTENCE;
13473			}
13474			training_offset = 0;
13475			}
13476			copy_n(training_input.begin() + training_offset, segment, instance_input.begin());
13477			copy_n(training_output.begin() + training_offset, segment, instance_output.begin());
13478
13479			// Shift training_offset
13480	0	0	for (training_shift = segment - 5; training_shift > segment / 2; training_shift--)
		0
		0
13481	0	0	if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT \|\| instance_input[training_shift-1].chr == ' ')
		0
		0
		0
		0
		0
		0
		0
		0
13482			break;
13483	0		training_offset += training_shift;
13484
13485			// Forward pass
13486	0	0	for (unsigned i = 0; i < segment; i++) {
		0
		0
13487	0		chosen_embeddings[i] = &embeddings.at(instance_input[i].chr);
13488	0	0	for (unsigned k = 0; k < D; k++)
		0
		0
13489	0	0	embedding_dropouts[i].w[0][k] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier;
		0
		0
		0
		0
		0
13490	0	0	for (int j = 0; j < 3; j++)
		0
		0
13491	0		instance_output[i].w[j] = projection_fwd.original.b[j];
13492			}
13493
13494	0	0	for (int dir = 0; dir < 2; dir++) {
		0
		0
13495	0	0	auto& gru = dir == 0 ? gru_fwd : gru_bwd;
		0
		0
13496	0	0	auto& projection = dir == 0 ? projection_fwd : projection_bwd;
		0
		0
13497
13498			gru.states[0].clear();
13499	0	0	for (size_t i = 0; i < segment; i++) {
		0
		0
13500	0	0	auto& embedding = chosen_embeddings[dir == 0 ? i : segment - 1 - i];
		0
		0
13501	0	0	auto& embedding_dropout = embedding_dropouts[dir == 0 ? i : segment - 1 - i];
		0
		0
13502	0	0	auto& output = instance_output[dir == 0 ? i : segment - 1 - i];
		0
		0
13503
13504	0	0	for (int j = 0; j < D; j++) {
		0
		0
13505	0		gru.updates[i].w[0][j] = gru.X_z.original.b[j];
13506	0		gru.resets[i].w[0][j] = gru.X_r.original.b[j];
13507	0	0	for (int k = 0; k < D; k++) {
		0
		0
13508	0		gru.updates[i].w[0][j] += embedding_dropout.w[0][k] * embedding->original.w[0][k] * gru.X_z.original.w[j][k] + gru.states[i].w[0][k] * gru.H_z.original.w[j][k];
13509	0		gru.resets[i].w[0][j] += embedding_dropout.w[0][k] * embedding->original.w[0][k] * gru.X_r.original.w[j][k] + gru.states[i].w[0][k] * gru.H_r.original.w[j][k];
13510			}
13511	0		gru.updates[i].w[0][j] = 1.f / (1.f + exp(-gru.updates[i].w[0][j]));
13512	0		gru.resets[i].w[0][j] = 1.f / (1.f + exp(-gru.resets[i].w[0][j]));
13513	0		gru.resetstates[i].w[0][j] = gru.resets[i].w[0][j] * gru.states[i].w[0][j];
13514			}
13515	0	0	for (int j = 0; j < D; j++) {
		0
		0
13516	0		gru.candidates[i].w[0][j] = gru.X.original.b[j];
13517	0	0	for (int k = 0; k < D; k++)
		0
		0
13518	0		gru.candidates[i].w[0][j] += embedding_dropout.w[0][k] * embedding->original.w[0][k] * gru.X.original.w[j][k] + gru.resetstates[i].w[0][k] * gru.H.original.w[j][k];
13519	0		gru.candidates[i].w[0][j] = tanh(gru.candidates[i].w[0][j]);
13520	0		gru.states[i+1].w[0][j] = gru.updates[i].w[0][j] * gru.states[i].w[0][j] + (1.f - gru.updates[i].w[0][j]) * gru.candidates[i].w[0][j];
13521			}
13522
13523	0	0	for (int j = 0; j < D; j++)
		0
		0
13524	0	0	gru.dropouts[i].w[0][j] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier * gru.states[i+1].w[0][j];
		0
		0
		0
		0
		0
13525
13526	0	0	for (int j = 0; j < 3; j++)
		0
		0
13527	0	0	for (int k = 0; k < D; k++)
		0
		0
13528	0		output.w[j] += projection.original.w[j][k] * gru.dropouts[i].w[0][k];
13529			}
13530			}
13531
13532	0	0	for (auto&& output : instance_output) {
		0
		0
13533	0		int best = output.w[1] > output.w[0];
13534	0	0	if (output.w[2] > output.w[best]) best = 2;
		0
		0
13535	0		float maximum = output.w[best], sum = 0;
13536	0	0	for (int j = 0; j < 3; j++) sum += (output.w[j] = exp(output.w[j] - maximum));
		0
		0
13537	0		sum = 1.f / sum;
13538	0	0	for (int j = 0; j < 3; j++) output.w[j] *= sum;
		0
		0
13539
13540	0		total++;
13541	0		correct += best == output.outcome;
13542	0		logprob += log(output.w[output.outcome]);
13543			}
13544
13545			// Backward pass
13546	0	0	for (auto&& output : instance_output)
		0
		0
13547	0	0	for (int j = 0; j < 3; j++)
		0
		0
13548	0	0	output.w[j] = (output.outcome == j) - output.w[j];
		0
		0
13549
13550	0	0	for (int dir = 0; dir < 2; dir++) {
		0
		0
13551	0	0	auto& gru = dir == 0 ? gru_fwd : gru_bwd;
		0
		0
13552	0	0	auto& projection = dir == 0 ? projection_fwd : projection_bwd;
		0
		0
13553
13554			matrix<1, D> state_g, update_g, candidate_g, reset_g, resetstate_g;
13555			state_g.clear();
13556	0	0	for (size_t i = segment; i--; ) {
		0
		0
13557	0	0	auto& embedding = chosen_embeddings[dir == 0 ? i : segment - 1 - i];
		0
		0
13558	0	0	auto& embedding_dropout = embedding_dropouts[dir == 0 ? i : segment - 1 - i];
		0
		0
13559	0	0	auto& output = instance_output[dir == 0 ? i : segment - 1 - i];
		0
		0
13560
13561	0	0	for (int j = 0; j < D; j++) // These for cycles are swapped because
		0
		0
13562	0	0	for (int k = 0; k < 3; k++) // g++-4.8 generates wrong code otherwise.
		0
		0
13563	0		projection.w_g[k][j] += gru.dropouts[i].w[0][j] * output.w[k];
13564
13565	0	0	for (int j = 0; j < D; j++)
		0
		0
13566	0	0	if (gru.dropouts[i].w[0][j])
		0
		0
13567	0	0	for (int k = 0; k < 3; k++)
		0
		0
13568	0		state_g.w[0][j] += projection.original.w[k][j] * output.w[k];
13569
13570			resetstate_g.clear();
13571	0	0	for (int j = 0; j < D; j++) {
		0
		0
13572	0		update_g.w[0][j] = state_g.w[0][j] * (gru.states[i].w[0][j] - gru.candidates[i].w[0][j]);
13573	0		candidate_g.w[0][j] = state_g.w[0][j] * (1.f - gru.updates[i].w[0][j]);
13574	0		state_g.w[0][j] = state_g.w[0][j] * gru.updates[i].w[0][j];
13575
13576	0		candidate_g.w[0][j] *= 1 - gru.candidates[i].w[0][j] * gru.candidates[i].w[0][j];
13577	0		gru.X.b_g[j] += candidate_g.w[0][j];
13578	0	0	for (int k = 0; k < D; k++) {
		0
		0
13579	0		gru.X.w_g[j][k] += candidate_g.w[0][j] * embedding_dropout.w[0][k] * embedding->original.w[0][k];
13580	0		gru.H.w_g[j][k] += candidate_g.w[0][j] * gru.resetstates[i].w[0][k];
13581	0		embedding->w_g[0][k] += embedding_dropout.w[0][k] * candidate_g.w[0][j] * gru.X.original.w[j][k];
13582	0		resetstate_g.w[0][k] += candidate_g.w[0][j] * gru.H.original.w[j][k];
13583			}
13584			}
13585	0	0	for (int j = 0; j < D; j++) {
		0
		0
13586	0		state_g.w[0][j] += resetstate_g.w[0][j] * gru.resets[i].w[0][j];
13587	0		reset_g.w[0][j] = resetstate_g.w[0][j] * gru.states[i].w[0][j];
13588
13589	0		update_g.w[0][j] *= gru.updates[i].w[0][j] * (1 - gru.updates[i].w[0][j]);
13590	0		reset_g.w[0][j] *= gru.resets[i].w[0][j] * (1 - gru.resets[i].w[0][j]);
13591
13592	0		gru.X_z.b_g[j] += update_g.w[0][j];
13593	0		gru.X_r.b_g[j] += reset_g.w[0][j];
13594	0	0	for (int k = 0; k < D; k++) {
		0
		0
13595	0		gru.X_z.w_g[j][k] += update_g.w[0][j] * embedding_dropout.w[0][k] * embedding->original.w[0][k];
13596	0		gru.H_z.w_g[j][k] += update_g.w[0][j] * gru.states[i].w[0][k];
13597	0		gru.X_r.w_g[j][k] += reset_g.w[0][j] * embedding_dropout.w[0][k] * embedding->original.w[0][k];
13598	0		gru.H_r.w_g[j][k] += reset_g.w[0][j] * gru.states[i].w[0][k];
13599	0		embedding->w_g[0][k] += embedding_dropout.w[0][k] * (update_g.w[0][j] * gru.X_z.original.w[j][k] +
13600	0		reset_g.w[0][j] * gru.X_r.original.w[j][k]);
13601	0		state_g.w[0][k] += update_g.w[0][j] * gru.H_z.original.w[j][k] + reset_g.w[0][j] * gru.H_r.original.w[j][k];
13602			}
13603			}
13604			}
13605			}
13606
13607			// Update the weights
13608	0	0	if (batch_size == 1 \|\|
		0
		0
		0
		0
		0
		0
		0
		0
13609			instance+1 == instances \|\|
13610			(instance+1) % batch_size == 0) {
13611	0		b1t *= 0.9f;
13612	0		b2t *= 0.999f;
13613	0		float learning_rate_biased = learning_rate * sqrt(1-b2t) / (1-b1t);
13614
13615	0	0	if (batch_size == 1)
		0
		0
13616	0	0	for (auto&& chosen_embedding : chosen_embeddings)
		0
		0
13617	0		chosen_embedding->update_weights(learning_rate_biased);
13618			else
13619	0	0	for (auto&& embedding : embeddings)
		0
		0
13620	0		embedding.second.update_weights(learning_rate_biased);
13621	0		gru_fwd.update_weights(learning_rate_biased);
13622	0		gru_bwd.update_weights(learning_rate_biased);
13623	0		projection_fwd.update_weights(learning_rate_biased);
13624	0		projection_bwd.update_weights(learning_rate_biased);
13625			}
13626			}
13627	0	0	if (learning_rate_final && learning_rate_final != learning_rate_initial)
		0
		0
13628	0		learning_rate = exp(((epochs - epoch - 2) * log(learning_rate_initial) + (epoch + 1) * log(learning_rate_final)) / (epochs - 1));
13629
13630			// Evaluate
13631	0	0	cerr << "Epoch " << epoch+1 << ", logprob: " << scientific << setprecision(4) << logprob
		0
		0
13632	0		<< ", training acc: " << fixed << setprecision(2) << 100. * correct / double(total) << "%";
13633	0	0	if (!heldout.empty()) {
		0
		0
13634			f1_info tokens, sentences;
13635	0	0	evaluate(url_email_tokenizer, segment, allow_spaces, heldout, tokens, sentences);
		0
		0
13636	0	0	cerr << ", heldout tokens: " << 100. * tokens.precision << "%P/" << 100. * tokens.recall << "%R/"
		0
		0
13637	0		<< 100. * tokens.f1 << "%, sentences: " << 100. * sentences.precision << "%P/"
13638	0		<< 100. * sentences.recall << "%R/" << 100. * sentences.f1 << "%";
13639
13640	0	0	if (early_stopping && sentences.f1 + tokens.f1 > best_combined_f1) {
		0
		0
		0
		0
		0
13641	0		best_combined_f1 = sentences.f1 + tokens.f1;
13642			best_combined_f1_epoch = epoch;
13643			best_combined_f1_network = *this;
13644			}
13645	0	0	if (early_stopping && best_combined_f1 && epoch - best_combined_f1_epoch > 30) {
		0
		0
		0
		0
		0
13646			cerr << endl << "Stopping after 30 iterations of not improving sum of sentence and token f1." << endl;
13647	0		break;
13648			}
13649			}
13650			cerr << endl;
13651			}
13652
13653			// Choose best network if desired
13654	0	0	if (early_stopping && best_combined_f1) {
		0
		0
13655	0		cerr << "Choosing parameters from epoch " << best_combined_f1_epoch+1 << "." << endl;
13656			this->embeddings = best_combined_f1_network.embeddings;
13657	0		this->gru_fwd = best_combined_f1_network.gru_fwd;
13658	0		this->gru_bwd = best_combined_f1_network.gru_bwd;
13659	0		this->projection_fwd = best_combined_f1_network.projection_fwd;
13660	0		this->projection_bwd = best_combined_f1_network.projection_bwd;
13661			}
13662
13663			// Encode the network
13664	0	0	enc.add_1B(1);
		0
		0
13665	0	0	enc.add_1B(D);
		0
		0
13666
13667	0		enc.add_4B(this->embeddings.size());
13668	0	0	for (auto&& embedding : this->embeddings) {
		0
		0
13669	0		enc.add_4B(embedding.first);
13670	0		enc.add_data(embedding.second.e.w[0], D);
13671			}
13672	0	0	save_gru(this->gru_fwd, enc);
		0
		0
13673	0	0	save_gru(this->gru_bwd, enc);
		0
		0
13674	0	0	save_matrix(this->projection_fwd, enc);
		0
		0
13675	0	0	save_matrix(this->projection_bwd, enc);
		0
		0
13676
13677			return true;
13678			}
13679
13680			template <int D> template <int R, int C>
13681	0		void gru_tokenizer_network_trainer<D>::matrix_trainer<R, C>::update_weights(float learning_rate) {
13682	0	0	for (int i = 0; i < R; i++) {
		0
		0
		0
		0
		0
		0
		0
		0
13683	0	0	for (int j = 0; j < C; j++) {
		0
		0
		0
		0
		0
		0
		0
		0
13684	0		w_m[i][j] = 0.9 * w_m[i][j] + (1-0.9) * w_g[i][j];
13685	0		w_v[i][j] = 0.999 * w_v[i][j] + (1-0.999) * w_g[i][j] * w_g[i][j];
13686	0		original.w[i][j] += learning_rate * w_m[i][j] / (sqrt(w_v[i][j]) + 1e-8);
13687			}
13688	0		b_m[i] = 0.9 * b_m[i] + (1-0.9) * b_g[i];
13689	0		b_v[i] = 0.999 * b_v[i] + (1-0.999) * b_g[i] * b_g[i];
13690	0		original.b[i] += learning_rate * b_m[i] / (sqrt(b_v[i]) + 1e-8);
13691			}
13692
13693	0	0	for (int i = 0; i < R; i++) {
		0
		0
		0
		0
		0
		0
		0
		0
13694	0	0	for (int j = 0; j < C; j++)
		0
		0
		0
		0
		0
		0
		0
		0
13695	0		w_g[i][j] = 0.f;
13696	0		b_g[i] = 0.f;
13697			}
13698	0		}
13699
13700			template
13701	0		void gru_tokenizer_network_trainer::gru_trainer::update_weights(float learning_rate) {
13702	0		X.update_weights(learning_rate);
13703	0		X_r.update_weights(learning_rate);
13704	0		X_z.update_weights(learning_rate);
13705	0		H.update_weights(learning_rate);
13706	0		H_r.update_weights(learning_rate);
13707	0		H_z.update_weights(learning_rate);
13708	0		}
13709
13710			template
13711	0		void gru_tokenizer_network_trainer::evaluate(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, const vector& heldout,
13712			f1_info& tokens_f1, f1_info& sentences_f1) {
13713			// Generate gold data
13714			vector gold_sentences, gold_tokens;
13715			u32string text;
13716	0	0	for (auto&& sentence : heldout) {
		0
		0
13717	0	0	if (sentence.tokens.empty()) continue;
		0
		0
13718
13719	0	0	gold_sentences.emplace_back(text.size() + sentence.tokens.front().start, sentence.tokens.back().start + sentence.tokens.back().length - sentence.tokens.front().start);
		0
		0
13720	0	0	for (auto&& token : sentence.tokens)
		0
		0
13721	0	0	gold_tokens.emplace_back(text.size() + token.start, token.length);
		0
		0
13722			text.append(sentence.sentence);
13723			}
13724
13725			// Generate system data
13726			vector system_sentences, system_tokens, tokens;
13727			string text_utf8;
13728
13729	0		this->cache_embeddings();
13730	0		gru_tokenizer tokenizer(url_email_tokenizer, segment, allow_spaces, *this);
13731	0	0	unilib::utf8::encode(text, text_utf8);
		0
		0
13732	0	0	tokenizer.set_text(text_utf8);
		0
		0
13733
13734	0	0	while (tokenizer.next_sentence(tokens))
		0
		0
		0
		0
		0
13735	0	0	if (!tokens.empty()) {
		0
		0
13736	0	0	system_sentences.emplace_back(tokens.front().start, tokens.back().start + tokens.back().length - tokens.front().start);
		0
		0
13737	0	0	system_tokens.insert(system_tokens.end(), tokens.begin(), tokens.end());
		0
		0
13738			}
13739
13740	0		evaluate_f1(system_tokens, gold_tokens, tokens_f1);
13741	0		evaluate_f1(system_sentences, gold_sentences, sentences_f1);
13742	0		}
13743
13744			template
13745	0		void gru_tokenizer_network_trainer::evaluate_f1(const vector& system, const vector& gold, f1_info& f1) {
13746			size_t both = 0;
13747	0	0	for (size_t si = 0, gi = 0; si < system.size() \|\| gi < gold.size(); )
		0
		0
		0
		0
		0
		0
		0
		0
13748	0	0	if (si < system.size() && (gi == gold.size() \|\| system[si].start < gold[gi].start))
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
13749	0		si++;
13750	0	0	else if (gi < gold.size() && (si == system.size() \|\| gold[gi].start < system[si].start))
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
13751	0		gi++;
13752			else
13753	0		both += system[si++].length == gold[gi++].length;
13754
13755	0	0	f1.precision = system.size() ? both / double(system.size()) : 0.;
		0
		0
13756	0	0	f1.recall = gold.size() ? both / double(gold.size()) : 0.;
		0
		0
13757	0	0	f1.f1 = system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0.;
		0
		0
13758	0		}
13759
13760			template template
13761	0		void gru_tokenizer_network_trainer::random_matrix(matrix& m, mt19937& generator, float range, float bias) {
13762	0		uniform_real_distribution uniform(-range, range);
13763	0	0	for (int i = 0; i < R; i++) {
		0
		0
		0
		0
		0
		0
		0
		0
13764	0		m.b[i] = bias;
13765	0	0	for (int j = 0; j < C; j++)
		0
		0
		0
		0
		0
		0
		0
		0
13766	0		m.w[i][j] = uniform(generator);
13767			}
13768	0		}
13769
13770			template
13771	0		void gru_tokenizer_network_trainer::random_gru(gru& g, mt19937& generator, float range) {
13772	0		random_matrix(g.X, generator, range, 0.f);
13773	0		random_matrix(g.X_r, generator, range, 1.f);
13774	0		random_matrix(g.X_z, generator, range, 1.f);
13775	0		random_matrix(g.H, generator, range, 0.f);
13776	0		random_matrix(g.H_r, generator, range, 1.f);
13777	0		random_matrix(g.H_z, generator, range, 1.f);
13778	0		}
13779
13780			template template
13781	0		void gru_tokenizer_network_trainer::save_matrix(const matrix& m, binary_encoder& enc) {
13782	0	0	for (int i = 0; i < R; i++)
		0
		0
		0
		0
		0
13783	0		enc.add_data(m.w[i], C);
13784	0		enc.add_data(m.b, R);
13785	0		}
13786
13787			template
13788	0		void gru_tokenizer_network_trainer::save_gru(const gru& g, binary_encoder& enc) {
13789	0		save_matrix(g.X, enc);
13790	0		save_matrix(g.X_r, enc);
13791	0		save_matrix(g.X_z, enc);
13792	0		save_matrix(g.H, enc);
13793	0		save_matrix(g.H_r, enc);
13794	0		save_matrix(g.H_z, enc);
13795	0		}
13796
13797			} // namespace morphodita
13798
13799			/////////
13800			// File: morphodita/tokenizer/gru_tokenizer_trainer.cpp
13801			/////////
13802
13803			// This file is part of MorphoDiTa .
13804			//
13805			// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
13806			// Mathematics and Physics, Charles University in Prague, Czech Republic.
13807			//
13808			// This Source Code Form is subject to the terms of the Mozilla Public
13809			// License, v. 2.0. If a copy of the MPL was not distributed with this
13810			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
13811
13812			namespace morphodita {
13813
13814	0		bool gru_tokenizer_trainer::train(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, unsigned dimension, unsigned epochs,
13815			unsigned batch_size, float learning_rate, float learning_rate_final, float dropout,
13816			float initialization_range, bool early_stopping, const vector& data,
13817			const vector& heldout, ostream& os, string& error) {
13818			using namespace unilib;
13819
13820			error.clear();
13821
13822			// Start encoding the tokenizer
13823	0		os.put(2);
13824
13825	0		binary_encoder enc;
13826	0	0	enc.add_1B(url_email_tokenizer);
13827	0	0	enc.add_2B(segment);
13828	0	0	enc.add_1B(allow_spaces);
13829
13830			// Train the GRU network
13831	0	0	if (dimension == 16) {
13832			gru_tokenizer_network_trainer<16> network;
13833	0	0	if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final,
		0
13834			dropout, initialization_range, early_stopping, data, heldout, enc, error)) return false;
13835	0	0	} else if (dimension == 24) {
13836			gru_tokenizer_network_trainer<24> network;
13837	0	0	if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final,
		0
13838			dropout, initialization_range, early_stopping, data, heldout, enc, error)) return false;
13839	0	0	} else if (dimension == 64) {
13840			gru_tokenizer_network_trainer<64> network;
13841	0	0	if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final,
		0
13842			dropout, initialization_range, early_stopping, data, heldout, enc, error)) return false;
13843			} else {
13844	0	0	return error.assign("Gru tokenizer dimension '").append(to_string(dimension)).append("' is not supported!"), false;
		0
13845			}
13846
13847			// Compute best substitutions for every category
13848			unordered_map> counts;
13849	0	0	for (auto&& sentence : data)
13850	0	0	for (auto&& chr : sentence.sentence)
13851	0		counts[unicode::category(chr)][chr]++;
13852
13853			unordered_map unknown_chars;
13854	0	0	for (auto&& count : counts) {
13855	0		char32_t best_chr = 0;
13856			unsigned best = 0;
13857	0	0	for (auto&& chr : count.second)
13858	0	0	if (chr.second > best)
13859	0		best = chr.second, best_chr = chr.first;
13860	0	0	if (best_chr)
13861	0		unknown_chars.emplace(count.first, best_chr);
13862			}
13863	0	0	enc.add_1B(unknown_chars.size());
13864	0	0	for (auto&& unknown_char : unknown_chars) {
13865	0		enc.add_4B(unknown_char.first);
13866	0		enc.add_4B(unknown_char.second);
13867			}
13868
13869	0	0	if (!compressor::save(os, enc)) return error.assign("Cannot save gru_tokenizer_factory!"), false;
		0
		0
13870			return true;
13871			}
13872
13873			} // namespace morphodita
13874
13875			/////////
13876			// File: morphodita/tokenizer/ragel_tokenizer.cpp
13877			/////////
13878
13879			// This file is part of MorphoDiTa .
13880			//
13881			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
13882			// Mathematics and Physics, Charles University in Prague, Czech Republic.
13883			//
13884			// This Source Code Form is subject to the terms of the Mozilla Public
13885			// License, v. 2.0. If a copy of the MPL was not distributed with this
13886			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
13887
13888			namespace morphodita {
13889
13890			static const char _ragel_url_email_cond_offsets[] = {
13891			0, 0, 0, 0, 0, 0, 0, 0,
13892			0, 0, 0, 0, 0, 1, 1, 1,
13893			1, 1, 1, 1, 1, 1, 1, 1,
13894			1, 1, 1, 1, 1, 1, 1, 1,
13895			1, 1, 1, 1, 1, 1, 1, 1,
13896			1, 1, 1, 1, 1, 1, 1, 1,
13897			1, 1, 1, 1, 1, 1, 1, 1,
13898			1, 1, 1, 2, 3, 3, 4, 5,
13899			6, 7, 8, 9, 10, 11, 12, 13,
13900			14, 15, 16
13901			};
13902
13903			static const char _ragel_url_email_cond_lengths[] = {
13904			0, 0, 0, 0, 0, 0, 0, 0,
13905			0, 0, 0, 0, 1, 0, 0, 0,
13906			0, 0, 0, 0, 0, 0, 0, 0,
13907			0, 0, 0, 0, 0, 0, 0, 0,
13908			0, 0, 0, 0, 0, 0, 0, 0,
13909			0, 0, 0, 0, 0, 0, 0, 0,
13910			0, 0, 0, 0, 0, 0, 0, 0,
13911			0, 0, 1, 1, 0, 1, 1, 1,
13912			1, 1, 1, 1, 1, 1, 1, 1,
13913			1, 1, 1
13914			};
13915
13916			static const short _ragel_url_email_cond_keys[] = {
13917			41u, 41u, 47u, 47u, 47u, 47u, 41u, 41u,
13918			47u, 47u, 47u, 47u, 47u, 47u, 47u, 47u,
13919			47u, 47u, 47u, 47u, 47u, 47u, 47u, 47u,
13920			47u, 47u, 47u, 47u, 47u, 47u, 47u, 47u,
13921			47u, 47u, 0
13922			};
13923
13924			static const char _ragel_url_email_cond_spaces[] = {
13925			1, 0, 0, 1, 0, 0, 0, 0,
13926			0, 0, 0, 0, 0, 0, 0, 0,
13927			0, 0
13928			};
13929
13930			static const short _ragel_url_email_key_offsets[] = {
13931			0, 0, 15, 29, 41, 54, 63, 71,
13932			78, 86, 92, 100, 117, 145, 154, 162,
13933			171, 179, 188, 196, 204, 215, 225, 233,
13934			241, 252, 262, 270, 278, 289, 299, 315,
13935			330, 346, 360, 376, 393, 409, 426, 442,
13936			459, 475, 491, 510, 528, 544, 560, 579,
13937			597, 613, 629, 648, 666, 682, 698, 714,
13938			725, 726, 741, 752, 756, 773, 801, 812,
13939			823, 834, 848, 861, 879, 893, 908, 926,
13940			944, 962, 983
13941			};
13942
13943			static const short _ragel_url_email_trans_keys[] = {
13944			33u, 48u, 49u, 50u, 95u, 36u, 37u, 39u,
13945			46u, 51u, 57u, 65u, 90u, 97u, 122u, 33u,
13946			58u, 64u, 95u, 36u, 37u, 39u, 46u, 48u,
13947			57u, 65u, 90u, 97u, 122u, 33u, 95u, 36u,
13948			37u, 39u, 46u, 48u, 57u, 65u, 90u, 97u,
13949			122u, 33u, 64u, 95u, 36u, 37u, 39u, 46u,
13950			48u, 57u, 65u, 90u, 97u, 122u, 48u, 49u,
13951			50u, 51u, 57u, 65u, 90u, 97u, 122u, 45u,
13952			46u, 48u, 57u, 65u, 90u, 97u, 122u, 45u,
13953			48u, 57u, 65u, 90u, 97u, 122u, 45u, 46u,
13954			48u, 57u, 65u, 90u, 97u, 122u, 48u, 57u,
13955			65u, 90u, 97u, 122u, 45u, 46u, 48u, 57u,
13956			65u, 90u, 97u, 122u, 33u, 39u, 41u, 61u,
13957			95u, 36u, 47u, 48u, 57u, 58u, 59u, 63u,
13958			64u, 65u, 90u, 97u, 122u, 33u, 39u, 40u,
13959			44u, 46u, 61u, 63u, 95u, 129u, 131u, 135u,
13960			151u, 809u, 1065u, 36u, 38u, 42u, 57u, 58u,
13961			59u, 64u, 90u, 97u, 122u, 142u, 143u, 155u,
13962			159u, 48u, 49u, 50u, 51u, 57u, 65u, 90u,
13963			97u, 122u, 45u, 46u, 48u, 57u, 65u, 90u,
13964			97u, 122u, 48u, 49u, 50u, 51u, 57u, 65u,
13965			90u, 97u, 122u, 45u, 46u, 48u, 57u, 65u,
13966			90u, 97u, 122u, 48u, 49u, 50u, 51u, 57u,
13967			65u, 90u, 97u, 122u, 45u, 46u, 48u, 57u,
13968			65u, 90u, 97u, 122u, 45u, 46u, 48u, 57u,
13969			65u, 90u, 97u, 122u, 45u, 46u, 53u, 48u,
13970			52u, 54u, 57u, 65u, 90u, 97u, 122u, 45u,
13971			46u, 48u, 53u, 54u, 57u, 65u, 90u, 97u,
13972			122u, 45u, 46u, 48u, 57u, 65u, 90u, 97u,
13973			122u, 45u, 46u, 48u, 57u, 65u, 90u, 97u,
13974			122u, 45u, 46u, 53u, 48u, 52u, 54u, 57u,
13975			65u, 90u, 97u, 122u, 45u, 46u, 48u, 53u,
13976			54u, 57u, 65u, 90u, 97u, 122u, 45u, 46u,
13977			48u, 57u, 65u, 90u, 97u, 122u, 45u, 46u,
13978			48u, 57u, 65u, 90u, 97u, 122u, 45u, 46u,
13979			53u, 48u, 52u, 54u, 57u, 65u, 90u, 97u,
13980			122u, 45u, 46u, 48u, 53u, 54u, 57u, 65u,
13981			90u, 97u, 122u, 33u, 45u, 46u, 58u, 64u,
13982			95u, 36u, 37u, 39u, 44u, 48u, 57u, 65u,
13983			90u, 97u, 122u, 33u, 45u, 58u, 64u, 95u,
13984			36u, 37u, 39u, 46u, 48u, 57u, 65u, 90u,
13985			97u, 122u, 33u, 45u, 46u, 58u, 64u, 95u,
13986			36u, 37u, 39u, 44u, 48u, 57u, 65u, 90u,
13987			97u, 122u, 33u, 58u, 64u, 95u, 36u, 37u,
13988			39u, 46u, 48u, 57u, 65u, 90u, 97u, 122u,
13989			33u, 45u, 46u, 58u, 64u, 95u, 36u, 37u,
13990			39u, 44u, 48u, 57u, 65u, 90u, 97u, 122u,
13991			33u, 48u, 49u, 50u, 58u, 64u, 95u, 36u,
13992			37u, 39u, 46u, 51u, 57u, 65u, 90u, 97u,
13993			122u, 33u, 45u, 46u, 58u, 64u, 95u, 36u,
13994			37u, 39u, 44u, 48u, 57u, 65u, 90u, 97u,
13995			122u, 33u, 48u, 49u, 50u, 58u, 64u, 95u,
13996			36u, 37u, 39u, 46u, 51u, 57u, 65u, 90u,
13997			97u, 122u, 33u, 45u, 46u, 58u, 64u, 95u,
13998			36u, 37u, 39u, 44u, 48u, 57u, 65u, 90u,
13999			97u, 122u, 33u, 48u, 49u, 50u, 58u, 64u,
14000			95u, 36u, 37u, 39u, 46u, 51u, 57u, 65u,
14001			90u, 97u, 122u, 33u, 45u, 46u, 58u, 64u,
14002			95u, 36u, 37u, 39u, 44u, 48u, 57u, 65u,
14003			90u, 97u, 122u, 33u, 45u, 46u, 58u, 64u,
14004			95u, 36u, 37u, 39u, 44u, 48u, 57u, 65u,
14005			90u, 97u, 122u, 33u, 45u, 46u, 53u, 58u,
14006			64u, 95u, 36u, 37u, 39u, 44u, 48u, 52u,
14007			54u, 57u, 65u, 90u, 97u, 122u, 33u, 45u,
14008			46u, 58u, 64u, 95u, 36u, 37u, 39u, 44u,
14009			48u, 53u, 54u, 57u, 65u, 90u, 97u, 122u,
14010			33u, 45u, 46u, 58u, 64u, 95u, 36u, 37u,
14011			39u, 44u, 48u, 57u, 65u, 90u, 97u, 122u,
14012			33u, 45u, 46u, 58u, 64u, 95u, 36u, 37u,
14013			39u, 44u, 48u, 57u, 65u, 90u, 97u, 122u,
14014			33u, 45u, 46u, 53u, 58u, 64u, 95u, 36u,
14015			37u, 39u, 44u, 48u, 52u, 54u, 57u, 65u,
14016			90u, 97u, 122u, 33u, 45u, 46u, 58u, 64u,
14017			95u, 36u, 37u, 39u, 44u, 48u, 53u, 54u,
14018			57u, 65u, 90u, 97u, 122u, 33u, 45u, 46u,
14019			58u, 64u, 95u, 36u, 37u, 39u, 44u, 48u,
14020			57u, 65u, 90u, 97u, 122u, 33u, 45u, 46u,
14021			58u, 64u, 95u, 36u, 37u, 39u, 44u, 48u,
14022			57u, 65u, 90u, 97u, 122u, 33u, 45u, 46u,
14023			53u, 58u, 64u, 95u, 36u, 37u, 39u, 44u,
14024			48u, 52u, 54u, 57u, 65u, 90u, 97u, 122u,
14025			33u, 45u, 46u, 58u, 64u, 95u, 36u, 37u,
14026			39u, 44u, 48u, 53u, 54u, 57u, 65u, 90u,
14027			97u, 122u, 33u, 45u, 46u, 58u, 64u, 95u,
14028			36u, 37u, 39u, 44u, 48u, 57u, 65u, 90u,
14029			97u, 122u, 33u, 45u, 46u, 58u, 64u, 95u,
14030			36u, 37u, 39u, 44u, 48u, 57u, 65u, 90u,
14031			97u, 122u, 33u, 45u, 46u, 58u, 64u, 95u,
14032			36u, 37u, 39u, 44u, 48u, 57u, 65u, 90u,
14033			97u, 122u, 33u, 47u, 95u, 36u, 37u, 39u,
14034			57u, 65u, 90u, 97u, 122u, 47u, 33u, 48u,
14035			49u, 50u, 95u, 36u, 37u, 39u, 46u, 51u,
14036			57u, 65u, 90u, 97u, 122u, 45u, 46u, 58u,
14037			303u, 559u, 48u, 57u, 65u, 90u, 97u, 122u,
14038			303u, 559u, 48u, 57u, 33u, 39u, 41u, 61u,
14039			95u, 36u, 47u, 48u, 57u, 58u, 59u, 63u,
14040			64u, 65u, 90u, 97u, 122u, 33u, 39u, 40u,
14041			44u, 46u, 61u, 63u, 95u, 129u, 131u, 135u,
14042			151u, 809u, 1065u, 36u, 38u, 42u, 57u, 58u,
14043			59u, 64u, 90u, 97u, 122u, 142u, 143u, 155u,
14044			159u, 45u, 46u, 58u, 303u, 559u, 48u, 57u,
14045			65u, 90u, 97u, 122u, 45u, 46u, 58u, 303u,
14046			559u, 48u, 57u, 65u, 90u, 97u, 122u, 45u,
14047			46u, 58u, 303u, 559u, 48u, 57u, 65u, 90u,
14048			97u, 122u, 45u, 46u, 53u, 58u, 303u, 559u,
14049			48u, 52u, 54u, 57u, 65u, 90u, 97u, 122u,
14050			45u, 46u, 58u, 303u, 559u, 48u, 53u, 54u,
14051			57u, 65u, 90u, 97u, 122u, 33u, 45u, 46u,
14052			58u, 64u, 95u, 303u, 559u, 36u, 37u, 39u,
14053			44u, 48u, 57u, 65u, 90u, 97u, 122u, 33u,
14054			95u, 303u, 559u, 36u, 37u, 39u, 46u, 48u,
14055			57u, 65u, 90u, 97u, 122u, 33u, 64u, 95u,
14056			303u, 559u, 36u, 37u, 39u, 46u, 48u, 57u,
14057			65u, 90u, 97u, 122u, 33u, 45u, 46u, 58u,
14058			64u, 95u, 303u, 559u, 36u, 37u, 39u, 44u,
14059			48u, 57u, 65u, 90u, 97u, 122u, 33u, 45u,
14060			46u, 58u, 64u, 95u, 303u, 559u, 36u, 37u,
14061			39u, 44u, 48u, 57u, 65u, 90u, 97u, 122u,
14062			33u, 45u, 46u, 58u, 64u, 95u, 303u, 559u,
14063			36u, 37u, 39u, 44u, 48u, 57u, 65u, 90u,
14064			97u, 122u, 33u, 45u, 46u, 53u, 58u, 64u,
14065			95u, 303u, 559u, 36u, 37u, 39u, 44u, 48u,
14066			52u, 54u, 57u, 65u, 90u, 97u, 122u, 33u,
14067			45u, 46u, 58u, 64u, 95u, 303u, 559u, 36u,
14068			37u, 39u, 44u, 48u, 53u, 54u, 57u, 65u,
14069			90u, 97u, 122u, 0
14070			};
14071
14072			static const char _ragel_url_email_single_lengths[] = {
14073			0, 5, 4, 2, 3, 3, 2, 1,
14074			2, 0, 2, 5, 14, 3, 2, 3,
14075			2, 3, 2, 2, 3, 2, 2, 2,
14076			3, 2, 2, 2, 3, 2, 6, 5,
14077			6, 4, 6, 7, 6, 7, 6, 7,
14078			6, 6, 7, 6, 6, 6, 7, 6,
14079			6, 6, 7, 6, 6, 6, 6, 3,
14080			1, 5, 5, 2, 5, 14, 5, 5,
14081			5, 6, 5, 8, 4, 5, 8, 8,
14082			8, 9, 8
14083			};
14084
14085			static const char _ragel_url_email_range_lengths[] = {
14086			0, 5, 5, 5, 5, 3, 3, 3,
14087			3, 3, 3, 6, 7, 3, 3, 3,
14088			3, 3, 3, 3, 4, 4, 3, 3,
14089			4, 4, 3, 3, 4, 4, 5, 5,
14090			5, 5, 5, 5, 5, 5, 5, 5,
14091			5, 5, 6, 6, 5, 5, 6, 6,
14092			5, 5, 6, 6, 5, 5, 5, 4,
14093			0, 5, 3, 1, 6, 7, 3, 3,
14094			3, 4, 4, 5, 5, 5, 5, 5,
14095			5, 6, 6
14096			};
14097
14098			static const short _ragel_url_email_index_offsets[] = {
14099			0, 0, 11, 21, 29, 38, 45, 51,
14100			56, 62, 66, 72, 84, 106, 113, 119,
14101			126, 132, 139, 145, 151, 159, 166, 172,
14102			178, 186, 193, 199, 205, 213, 220, 232,
14103			243, 255, 265, 277, 290, 302, 315, 327,
14104			340, 352, 364, 378, 391, 403, 415, 429,
14105			442, 454, 466, 480, 493, 505, 517, 529,
14106			537, 539, 550, 559, 563, 575, 597, 606,
14107			615, 624, 635, 645, 659, 669, 680, 694,
14108			708, 722, 738
14109			};
14110
14111			static const char _ragel_url_email_indicies[] = {
14112			0, 2, 3, 4, 0, 0, 0, 5,
14113			6, 6, 1, 0, 7, 8, 0, 0,
14114			0, 0, 0, 0, 1, 9, 9, 9,
14115			9, 9, 9, 9, 1, 9, 8, 9,
14116			9, 9, 9, 9, 9, 1, 10, 11,
14117			12, 13, 14, 14, 1, 15, 16, 14,
14118			14, 14, 1, 15, 14, 14, 14, 1,
14119			15, 17, 14, 14, 14, 1, 14, 18,
14120			18, 1, 15, 17, 14, 19, 19, 1,
14121			20, 21, 21, 20, 20, 20, 21, 20,
14122			20, 21, 21, 1, 22, 22, 24, 22,
14123			22, 23, 22, 23, 23, 23, 23, 23,
14124			25, 26, 23, 23, 22, 23, 23, 23,
14125			23, 1, 27, 28, 29, 30, 18, 18,
14126			1, 15, 31, 14, 14, 14, 1, 32,
14127			33, 34, 35, 18, 18, 1, 15, 36,
14128			14, 14, 14, 1, 37, 38, 39, 40,
14129			18, 18, 1, 15, 36, 35, 14, 14,
14130			1, 15, 36, 32, 14, 14, 1, 15,
14131			36, 41, 35, 32, 14, 14, 1, 15,
14132			36, 32, 14, 14, 14, 1, 15, 31,
14133			30, 14, 14, 1, 15, 31, 27, 14,
14134			14, 1, 15, 31, 42, 30, 27, 14,
14135			14, 1, 15, 31, 27, 14, 14, 14,
14136			1, 15, 16, 13, 14, 14, 1, 15,
14137			16, 10, 14, 14, 1, 15, 16, 43,
14138			13, 10, 14, 14, 1, 15, 16, 10,
14139			14, 14, 14, 1, 0, 44, 45, 7,
14140			8, 0, 0, 0, 46, 46, 46, 1,
14141			0, 44, 7, 8, 0, 0, 0, 46,
14142			46, 46, 1, 0, 44, 47, 7, 8,
14143			0, 0, 0, 46, 46, 46, 1, 0,
14144			7, 8, 0, 0, 0, 46, 48, 48,
14145			1, 0, 44, 47, 7, 8, 0, 0,
14146			0, 46, 49, 49, 1, 0, 50, 51,
14147			52, 7, 8, 0, 0, 0, 53, 48,
14148			48, 1, 0, 44, 54, 7, 8, 0,
14149			0, 0, 46, 46, 46, 1, 0, 55,
14150			56, 57, 7, 8, 0, 0, 0, 58,
14151			48, 48, 1, 0, 44, 59, 7, 8,
14152			0, 0, 0, 46, 46, 46, 1, 0,
14153			60, 61, 62, 7, 8, 0, 0, 0,
14154			63, 48, 48, 1, 0, 44, 59, 7,
14155			8, 0, 0, 0, 58, 46, 46, 1,
14156			0, 44, 59, 7, 8, 0, 0, 0,
14157			55, 46, 46, 1, 0, 44, 59, 64,
14158			7, 8, 0, 0, 0, 58, 55, 46,
14159			46, 1, 0, 44, 59, 7, 8, 0,
14160			0, 0, 55, 46, 46, 46, 1, 0,
14161			44, 54, 7, 8, 0, 0, 0, 53,
14162			46, 46, 1, 0, 44, 54, 7, 8,
14163			0, 0, 0, 50, 46, 46, 1, 0,
14164			44, 54, 65, 7, 8, 0, 0, 0,
14165			53, 50, 46, 46, 1, 0, 44, 54,
14166			7, 8, 0, 0, 0, 50, 46, 46,
14167			46, 1, 0, 44, 45, 7, 8, 0,
14168			0, 0, 5, 46, 46, 1, 0, 44,
14169			45, 7, 8, 0, 0, 0, 2, 46,
14170			46, 1, 0, 44, 45, 66, 7, 8,
14171			0, 0, 0, 5, 2, 46, 46, 1,
14172			0, 44, 45, 7, 8, 0, 0, 0,
14173			2, 46, 46, 46, 1, 0, 44, 47,
14174			7, 8, 0, 0, 0, 46, 67, 67,
14175			1, 0, 44, 47, 7, 8, 0, 0,
14176			0, 46, 68, 68, 1, 0, 44, 47,
14177			69, 8, 0, 0, 0, 46, 68, 68,
14178			1, 9, 70, 9, 9, 9, 9, 9,
14179			1, 71, 1, 0, 2, 3, 4, 0,
14180			0, 0, 5, 46, 46, 1, 15, 17,
14181			72, 21, 23, 14, 19, 19, 1, 21,
14182			23, 72, 1, 20, 21, 21, 20, 20,
14183			20, 21, 20, 20, 21, 21, 1, 22,
14184			22, 24, 22, 22, 23, 22, 23, 23,
14185			23, 23, 23, 25, 26, 23, 23, 22,
14186			23, 23, 23, 23, 1, 15, 17, 72,
14187			21, 23, 14, 14, 14, 1, 15, 17,
14188			72, 21, 23, 40, 14, 14, 1, 15,
14189			17, 72, 21, 23, 37, 14, 14, 1,
14190			15, 17, 73, 72, 21, 23, 40, 37,
14191			14, 14, 1, 15, 17, 72, 21, 23,
14192			37, 14, 14, 14, 1, 0, 44, 47,
14193			74, 8, 0, 21, 23, 0, 0, 46,
14194			49, 49, 1, 9, 9, 21, 23, 9,
14195			9, 75, 9, 9, 1, 9, 8, 9,
14196			21, 23, 9, 9, 75, 9, 9, 1,
14197			0, 44, 47, 74, 8, 0, 21, 23,
14198			0, 0, 46, 46, 46, 1, 0, 44,
14199			47, 74, 8, 0, 21, 23, 0, 0,
14200			63, 46, 46, 1, 0, 44, 47, 74,
14201			8, 0, 21, 23, 0, 0, 60, 46,
14202			46, 1, 0, 44, 47, 76, 74, 8,
14203			0, 21, 23, 0, 0, 63, 60, 46,
14204			46, 1, 0, 44, 47, 74, 8, 0,
14205			21, 23, 0, 0, 60, 46, 46, 46,
14206			1, 0
14207			};
14208
14209			static const char _ragel_url_email_trans_targs[] = {
14210			2, 0, 30, 48, 50, 49, 52, 3,
14211			5, 4, 6, 26, 28, 27, 8, 7,
14212			13, 9, 10, 58, 11, 60, 12, 61,
14213			61, 12, 61, 14, 22, 24, 23, 15,
14214			16, 18, 20, 19, 17, 62, 63, 65,
14215			64, 21, 25, 29, 31, 35, 32, 33,
14216			34, 67, 36, 44, 46, 45, 37, 38,
14217			40, 42, 41, 39, 70, 71, 73, 72,
14218			43, 47, 51, 53, 54, 55, 56, 57,
14219			59, 66, 68, 69, 74
14220			};
14221
14222			static const char _ragel_url_email_trans_actions[] = {
14223			0, 0, 0, 0, 0, 0, 0, 0,
14224			0, 0, 0, 0, 0, 0, 0, 0,
14225			0, 0, 0, 1, 0, 1, 0, 1,
14226			2, 3, 4, 0, 0, 0, 0, 0,
14227			0, 0, 0, 0, 0, 1, 1, 1,
14228			1, 0, 0, 0, 0, 0, 0, 0,
14229			0, 1, 0, 0, 0, 0, 0, 0,
14230			0, 0, 0, 0, 1, 1, 1, 1,
14231			0, 0, 0, 0, 0, 0, 0, 0,
14232			1, 1, 1, 1, 1
14233			};
14234
14235			static const int ragel_url_email_start = 1;
14236
14237	2		vector ragel_tokenizer::ragel_map;
14238			atomic_flag ragel_tokenizer::ragel_map_flag = ATOMIC_FLAG_INIT;
14239
14240	0		ragel_tokenizer::ragel_tokenizer(unsigned url_email_tokenizer) : unicode_tokenizer(url_email_tokenizer) {
14241	0	0	initialize_ragel_map();
14242	0		}
14243
14244	2		void ragel_tokenizer::initialize_ragel_map() {
14245	1	50	while (ragel_map_flag.test_and_set()) {}
14246	1	50	if (ragel_map.empty()) {
14247	129	100	for (uint8_t ascii = 0; ascii < 128; ascii++)
14248	128		ragel_map.push_back(ascii);
14249
14250	1		ragel_map_add(U'\u2026', 160); // horizontal ellipsis (TRIPLE DOT)
14251	1		ragel_map_add(U'\u2019', 161); // right single quotation mark
14252	1		ragel_map_add(U'\u2018', 162); // left single quotation mark
14253	1		ragel_map_add(U'\u2010', 163); // hyphen
14254			}
14255			ragel_map_flag.clear();
14256	1		}
14257
14258	4		void ragel_tokenizer::ragel_map_add(char32_t chr, uint8_t mapping) {
14259	4	100	if (chr >= ragel_map.size())
14260	1		ragel_map.resize(chr + 1, 128);
14261	4		ragel_map[chr] = mapping;
14262	4		}
14263
14264	7		bool ragel_tokenizer::ragel_url_email(unsigned version, const vector& chars, size_t& current, vector& tokens) {
14265			int cs;
14266
14267	7		size_t start = current, end = current, parens = 0;
14268
14269			{
14270			cs = ragel_url_email_start;
14271			}
14272
14273			{
14274			int _klen;
14275			const short *_keys;
14276			int _trans;
14277			short _widec;
14278
14279	7	50	if ( ( current) == ( (chars.size() - 1)) )
14280			goto _test_eof;
14281			if ( cs == 0 )
14282			goto _out;
14283			_resume:
14284	60		_widec = ( ragel_char(chars[current]));
14285	30		_klen = _ragel_url_email_cond_lengths[cs];
14286	30		_keys = _ragel_url_email_cond_keys + (_ragel_url_email_cond_offsets[cs]*2);
14287	30	50	if ( _klen > 0 ) {
14288			const short *_lower = _keys;
14289			const short *_mid;
14290	0		const short *_upper = _keys + (_klen<<1) - 2;
14291			while (1) {
14292	0	0	if ( _upper < _lower )
14293			break;
14294
14295	0		_mid = _lower + (((_upper-_lower) >> 1) & ~1);
14296	0	0	if ( _widec < _mid[0] )
14297	0		_upper = _mid - 2;
14298	0	0	else if ( _widec > _mid[1] )
14299	0		_lower = _mid + 2;
14300			else {
14301	0		switch ( _ragel_url_email_cond_spaces[_ragel_url_email_cond_offsets[cs] + ((_mid - _keys)>>1)] ) {
14302			case 0: {
14303	0		_widec = (short)(256u + (( ragel_char(chars[current])) - 0u));
14304	0	0	if (
14305	0		version >= 2 ) _widec += 256;
14306			break;
14307			}
14308			case 1: {
14309	0		_widec = (short)(768u + (( ragel_char(chars[current])) - 0u));
14310	0	0	if (
14311	0		parens ) _widec += 256;
14312			break;
14313			}
14314			}
14315			break;
14316			}
14317			}
14318			}
14319
14320	30		_keys = _ragel_url_email_trans_keys + _ragel_url_email_key_offsets[cs];
14321	30		_trans = _ragel_url_email_index_offsets[cs];
14322
14323	30		_klen = _ragel_url_email_single_lengths[cs];
14324	30	50	if ( _klen > 0 ) {
14325			const short *_lower = _keys;
14326			const short *_mid;
14327	117		const short *_upper = _keys + _klen - 1;
14328			while (1) {
14329	117	100	if ( _upper < _lower )
14330			break;
14331
14332	87		_mid = _lower + ((_upper-_lower) >> 1);
14333	87	100	if ( _widec < *_mid )
14334	13		_upper = _mid - 1;
14335	74	50	else if ( _widec > *_mid )
14336	74		_lower = _mid + 1;
14337			else {
14338	0		_trans += (unsigned int)(_mid - _keys);
14339	0		goto _match;
14340			}
14341			}
14342	30		_keys += _klen;
14343	30		_trans += _klen;
14344			}
14345
14346	30		_klen = _ragel_url_email_range_lengths[cs];
14347	30	50	if ( _klen > 0 ) {
14348			const short *_lower = _keys;
14349			const short *_mid;
14350	93		const short *_upper = _keys + (_klen<<1) - 2;
14351			while (1) {
14352	93	100	if ( _upper < _lower )
14353			break;
14354
14355	86		_mid = _lower + (((_upper-_lower) >> 1) & ~1);
14356	86	100	if ( _widec < _mid[0] )
14357	9		_upper = _mid - 2;
14358	77	100	else if ( _widec > _mid[1] )
14359	54		_lower = _mid + 2;
14360			else {
14361	23		_trans += (unsigned int)((_mid - _keys)>>1);
14362	23		goto _match;
14363			}
14364			}
14365	7		_trans += _klen;
14366			}
14367
14368			_match:
14369	30		_trans = _ragel_url_email_indicies[_trans];
14370	30		cs = _ragel_url_email_trans_targs[_trans];
14371
14372	30	50	if ( _ragel_url_email_trans_actions[_trans] == 0 )
14373			goto _again;
14374
14375	0		switch ( _ragel_url_email_trans_actions[_trans] ) {
14376			case 3:
14377	0		{parens-=!!parens;}
14378	0		break;
14379			case 1:
14380	0		{ end = current + 1; }
14381	0		break;
14382			case 2:
14383	0		{parens++;}
14384	0		{ end = current + 1; }
14385	0		break;
14386			case 4:
14387	0		{parens-=!!parens;}
14388	0		{ end = current + 1; }
14389	0		break;
14390			}
14391
14392			_again:
14393	30	100	if ( cs == 0 )
14394			goto _out;
14395	23	50	if ( ++( current) != ( (chars.size() - 1)) )
14396			goto _resume;
14397			_test_eof: {}
14398			_out: {}
14399			}
14400
14401	7	50	if (end > start) {
14402	0		tokens.emplace_back(start, end - start);
14403	0		current = end;
14404	0		return true;
14405			} else {
14406	7		current = start;
14407	7		return false;
14408			}
14409			}
14410
14411			} // namespace morphodita
14412
14413			/////////
14414			// File: morphodita/tokenizer/vertical_tokenizer.h
14415			/////////
14416
14417			// This file is part of MorphoDiTa .
14418			//
14419			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14420			// Mathematics and Physics, Charles University in Prague, Czech Republic.
14421			//
14422			// This Source Code Form is subject to the terms of the Mozilla Public
14423			// License, v. 2.0. If a copy of the MPL was not distributed with this
14424			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
14425
14426			namespace morphodita {
14427
14428	0		class vertical_tokenizer : public unicode_tokenizer {
14429			public:
14430	0	0	vertical_tokenizer() : unicode_tokenizer(0) {}
14431
14432			virtual bool next_sentence(vector& tokens) override;
14433			};
14434
14435			} // namespace morphodita
14436
14437			/////////
14438			// File: morphodita/tokenizer/tokenizer.cpp
14439			/////////
14440
14441			// This file is part of MorphoDiTa .
14442			//
14443			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14444			// Mathematics and Physics, Charles University in Prague, Czech Republic.
14445			//
14446			// This Source Code Form is subject to the terms of the Mozilla Public
14447			// License, v. 2.0. If a copy of the MPL was not distributed with this
14448			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
14449
14450			namespace morphodita {
14451
14452	0		tokenizer* tokenizer::new_vertical_tokenizer() {
14453	0		return new vertical_tokenizer();
14454			}
14455
14456	0		tokenizer* tokenizer::new_czech_tokenizer() {
14457	0		return new czech_tokenizer(czech_tokenizer::CZECH, czech_tokenizer::LATEST);
14458			}
14459
14460	0		tokenizer* tokenizer::new_english_tokenizer() {
14461	0		return new english_tokenizer(english_tokenizer::LATEST);
14462			}
14463
14464	0		tokenizer* tokenizer::new_generic_tokenizer() {
14465	0		return new generic_tokenizer(generic_tokenizer::LATEST);
14466			}
14467
14468			} // namespace morphodita
14469
14470			/////////
14471			// File: morphodita/tokenizer/tokenizer_ids.h
14472			/////////
14473
14474			// This file is part of MorphoDiTa .
14475			//
14476			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14477			// Mathematics and Physics, Charles University in Prague, Czech Republic.
14478			//
14479			// This Source Code Form is subject to the terms of the Mozilla Public
14480			// License, v. 2.0. If a copy of the MPL was not distributed with this
14481			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
14482
14483			namespace morphodita {
14484
14485			class tokenizer_ids {
14486			public:
14487			enum tokenizer_id {
14488			CZECH = 0,
14489			ENGLISH = 1,
14490			GENERIC = 2,
14491			GRU = 3,
14492			};
14493
14494			static bool parse(const string& str, tokenizer_id& id) {
14495			if (str == "czech") return id = CZECH, true;
14496			if (str == "english") return id = ENGLISH, true;
14497			if (str == "generic") return id = GENERIC, true;
14498			if (str == "gru") return id = GRU, true;
14499			return false;
14500			}
14501			};
14502
14503			typedef tokenizer_ids::tokenizer_id tokenizer_id;
14504
14505			} // namespace morphodita
14506
14507			/////////
14508			// File: morphodita/tokenizer/tokenizer_factory.cpp
14509			/////////
14510
14511			// This file is part of MorphoDiTa .
14512			//
14513			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14514			// Mathematics and Physics, Charles University in Prague, Czech Republic.
14515			//
14516			// This Source Code Form is subject to the terms of the Mozilla Public
14517			// License, v. 2.0. If a copy of the MPL was not distributed with this
14518			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
14519
14520			namespace morphodita {
14521
14522	1		tokenizer_factory* tokenizer_factory::load(istream& is) {
14523	1		tokenizer_id id = tokenizer_id(is.get());
14524	1		switch (id) {
14525			case tokenizer_ids::GENERIC:
14526			{
14527			auto res = new_unique_ptr();
14528	0	0	if (res->load(is)) return res.release();
14529			break;
14530			}
14531			case tokenizer_ids::GRU:
14532			{
14533	1		auto res = new_unique_ptr();
14534	1	50	if (res->load(is)) return res.release();
		50
14535			break;
14536			}
14537			case tokenizer_ids::CZECH:
14538			{
14539			auto res = new_unique_ptr();
14540	0	0	if (res->load(is)) return res.release();
		0
14541			break;
14542			}
14543			case tokenizer_ids::ENGLISH:
14544			break;
14545			}
14546
14547			return nullptr;
14548			}
14549
			// Loads a tokenizer factory from the file named fname.
			// The name is passed through path_from_utf8 and the file is opened in binary
			// mode. Returns nullptr when the file cannot be opened; otherwise returns
			// whatever load(istream&) returns for the opened stream.
14550	0		tokenizer_factory* tokenizer_factory::load(const char* fname) {
14551	0	0	ifstream f(path_from_utf8(fname).c_str(), ifstream::binary);
14552	0	0	if (!f) return nullptr;
14553
14554	0	0	return load(f);
14555			}
14556
14557			} // namespace morphodita
14558
14559			/////////
14560			// File: morphodita/tokenizer/unicode_tokenizer.cpp
14561			/////////
14562
14563			// This file is part of MorphoDiTa .
14564			//
14565			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14566			// Mathematics and Physics, Charles University in Prague, Czech Republic.
14567			//
14568			// This Source Code Form is subject to the terms of the Mozilla Public
14569			// License, v. 2.0. If a copy of the MPL was not distributed with this
14570			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
14571
14572			namespace morphodita {
14573
			// Stores the url_email_tokenizer setting, initializes the shared ragel map and
			// starts with an empty text (null pointer, zero length).
14574	1		unicode_tokenizer::unicode_tokenizer(unsigned url_email_tokenizer) : url_email_tokenizer(url_email_tokenizer) {
14575	1	50	ragel_tokenizer::initialize_ragel_map();
14576
14577	1	50	set_text(string_piece(nullptr, 0));
14578	1		}
14579
14580	2		void unicode_tokenizer::set_text(string_piece text, bool make_copy /= false/) {
14581			using namespace unilib;
14582
14583	2	50	if (make_copy && text.str) {
		0
14584	0		text_buffer.assign(text.str, text.len);
14585	0		text.str = text_buffer.c_str();
14586			}
14587	2		current = 0;
14588
14589			chars.clear();
14590	36	100	for (const char* curr_str = text.str; text.len; curr_str = text.str)
14591	34		chars.emplace_back(utf8::decode(text.str, text.len), curr_str);
14592	2		chars.emplace_back(0, text.str);
14593	2		}
14594
			// Tokenizes the next sentence of the current text.
			//   forms      - optional; when non-null it is cleared and then filled with one
			//                (pointer, byte length) piece per token, pointing into the text.
			//   tokens_ptr - optional; when non-null it receives the token ranges, otherwise
			//                the internal tokens_buffer is used.
			// Returns false without tokenizing when the position is already at the end of
			// the text; otherwise returns the result of next_sentence(tokens).
			// NOTE(review): the vector element types are missing from this listing.
14595	2		bool unicode_tokenizer::next_sentence(vector* forms, vector* tokens_ptr) {
14596	2	50	vector& tokens = tokens_ptr ? *tokens_ptr : tokens_buffer;
14597			tokens.clear();
14598	2	50	if (forms) forms->clear();
14599	2	50	if (current >= chars.size() - 1) return false;
14600
14601	2		bool result = next_sentence(tokens);
14602	2	50	if (forms)
14603	9	100	for (auto&& token : tokens)
			// The byte length is the distance between the first byte of the token and
			// the first byte of the character following it.
14604	7		forms->emplace_back(chars[token.start].str, chars[token.start + token.length].str - chars[token.start].str);
14605
14606			return result;
14607			}
14608
			// Tries to recognize a URL or e-mail at the current position.
			// Returns false at end of text or when url_email_tokenizer is zero; otherwise
			// returns the result of ragel_tokenizer::ragel_url_email.
14609	7		bool unicode_tokenizer::tokenize_url_email(vector& tokens) {
14610	7	50	if (current >= chars.size() - 1) return false;
14611
14612	7	50	return url_email_tokenizer ? ragel_tokenizer::ragel_url_email(url_email_tokenizer, chars, current, tokens) : false;
14613			}
14614
			// Decides whether an overly long sentence should be cut after its last token.
			// Returns true when the sentence has at least 500 tokens, or at least 450
			// tokens and the first character of the last token has a unicode::P category
			// bit, or at least 400 tokens and that character has the unicode::Po bit.
			// tokens.back() is evaluated only after a size test has passed, so an empty
			// vector is never dereferenced.
14615	8		bool unicode_tokenizer::emergency_sentence_split(const vector& tokens) {
14616			using namespace unilib;
14617
14618			// Implement emergency splitting for large sentences
14619	8	50	return tokens.size() >= 500 \|\|
14620	16	50	(tokens.size() >= 450 && chars[tokens.back().start].cat & unicode::P) \|\|
		0
		50
14621	0	0	(tokens.size() >= 400 && chars[tokens.back().start].cat & unicode::Po);
14622			}
14623
			// Decides whether eos_chr following the given tokens ends a sentence.
			// Every character other than '.' is accepted unconditionally, as is a '.' with
			// no preceding token. For '.', the answer is false when the previous token is
			// a single character with a unicode::Lut category bit, or when abbreviations
			// is non-null and contains the lowercased previous token.
			// eos_buffer is used as scratch space for the lowercased token.
14624	0		bool unicode_tokenizer::is_eos(const vector& tokens, char32_t eos_chr, const unordered_set* abbreviations) {
14625			using namespace unilib;
14626
14627	0	0	if (eos_chr == '.' && !tokens.empty()) {
		0
		0
14628			// Ignore one-letter capitals before dot
14629	0	0	if (tokens.back().length == 1 && chars[tokens.back().start].cat & unicode::Lut)
		0
		0
14630			return false;
14631
14632			// Ignore specified abbreviations
14633	0	0	if (abbreviations) {
14634			eos_buffer.clear();
14635	0	0	for (size_t i = 0; i < tokens.back().length; i++)
14636	0		utf8::append(eos_buffer, unicode::lowercase(chars[tokens.back().start + i].chr));
14637	0	0	if (abbreviations->count(eos_buffer))
14638			return false;
14639			}
14640			}
14641			return true;
14642			}
14643
14644			} // namespace morphodita
14645
14646			/////////
14647			// File: morphodita/tokenizer/vertical_tokenizer.cpp
14648			/////////
14649
14650			// This file is part of MorphoDiTa .
14651			//
14652			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14653			// Mathematics and Physics, Charles University in Prague, Czech Republic.
14654			//
14655			// This Source Code Form is subject to the terms of the Mozilla Public
14656			// License, v. 2.0. If a copy of the MPL was not distributed with this
14657			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
14658
14659			namespace morphodita {
14660
			// Reads one sentence from vertical input, where each non-empty line is a token.
			// Tokens are appended to the given vector as (start, length) in characters.
			// The sentence ends at the first empty line or at the end of the text.
			// Returns false only when the position is already at the end of the text; a
			// call that starts on an empty line returns true with no token appended.
14661	0		bool vertical_tokenizer::next_sentence(vector& tokens) {
14662	0	0	if (current >= chars.size() - 1) return false;
14663
14664	0		while (true) {
14665	0		size_t line_start = current;
			// Advance to the end of the line or the end of the text.
14666	0	0	while (current < chars.size() - 1 && chars[current].chr != '\r' && chars[current].chr != '\n') current++;
		0
		0
		0
14667
14668			size_t line_end = current;
			// Consume the line terminator; "\r\n" and "\n\r" count as one terminator.
14669	0	0	if (current < chars.size() - 1) {
14670	0		current++;
14671	0	0	if (current < chars.size() - 1 &&
		0
		0
14672	0	0	((chars[current-1].chr == '\r' && chars[current].chr == '\n') \|\|
		0
14673	0	0	(chars[current-1].chr == '\n' && chars[current].chr == '\r')))
14674	0		current++;
14675			}
14676
14677	0	0	if (line_start < line_end)
14678	0		tokens.emplace_back(line_start, line_end - line_start);
14679			else
14680			break;
14681			}
14682
14683	0		return true;
14684			}
14685
14686			} // namespace morphodita
14687
14688			/////////
14689			// File: unilib/version.h
14690			/////////
14691
14692			// This file is part of UniLib .
14693			//
14694			// Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of
14695			// Mathematics and Physics, Charles University in Prague, Czech Republic.
14696			//
14697			// This Source Code Form is subject to the terms of the Mozilla Public
14698			// License, v. 2.0. If a copy of the MPL was not distributed with this
14699			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
14700			//
14701			// UniLib version: 3.3.0
14702			// Unicode version: 15.0.0
14703
14704			namespace unilib {
14705
			// Version descriptor of the UniLib library: three numeric components and a
			// prerelease string.
14706	0		struct version {
14707			unsigned major;
14708			unsigned minor;
14709			unsigned patch;
14710			std::string prerelease;
14711
14712			// Returns current version.
14713			static version current();
14714			};
14715
14716			} // namespace unilib
14717
14718			/////////
14719			// File: morphodita/version/version.h
14720			/////////
14721
14722			// This file is part of MorphoDiTa .
14723			//
14724			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14725			// Mathematics and Physics, Charles University in Prague, Czech Republic.
14726			//
14727			// This Source Code Form is subject to the terms of the Mozilla Public
14728			// License, v. 2.0. If a copy of the MPL was not distributed with this
14729			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
14730
14731			namespace morphodita {
14732
			// Version descriptor of the MorphoDiTa library: three numeric components and a
			// prerelease string.
14733	0		class version {
14734			public:
14735			unsigned major;
14736			unsigned minor;
14737			unsigned patch;
14738			string prerelease;
14739
14740			// Returns current MorphoDiTa version.
14741			static version current();
14742
14743			// Returns multi-line formatted version and copyright string.
			// other_libraries, when non-empty, is appended to the "using ..." list.
14744			static string version_and_copyright(const string& other_libraries = string());
14745			};
14746
14747			} // namespace morphodita
14748
14749			/////////
14750			// File: morphodita/version/version.cpp
14751			/////////
14752
14753			// This file is part of MorphoDiTa .
14754			//
14755			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14756			// Mathematics and Physics, Charles University in Prague, Czech Republic.
14757			//
14758			// This Source Code Form is subject to the terms of the Mozilla Public
14759			// License, v. 2.0. If a copy of the MPL was not distributed with this
14760			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
14761
14762			namespace morphodita {
14763
			// Returns the hard-coded version of this build: 1.11.1 with prerelease "dev".
14764	0		version version::current() {
14765	0	0	return {1, 11, 1, "dev"};
		0
14766			}
14767
14768			// Returns multi-line formatted version and copyright string.
			// The first line names the MorphoDiTa version (with "-<prerelease>" when the
			// prerelease is non-empty) and the UniLib version in use; a non-empty
			// other_libraries is appended after " and ". The copyright notice follows.
14769	0		string version::version_and_copyright(const string& other_libraries) {
14770	0		ostringstream info;
14771
14772			auto morphodita = version::current();
14773			auto unilib = unilib::version::current();
14774
14775	0		info << "MorphoDiTa version " << morphodita.major << '.' << morphodita.minor << '.' << morphodita.patch
14776	0	0	<< (morphodita.prerelease.empty() ? "" : "-") << morphodita.prerelease
		0
14777	0		<< " (using UniLib " << unilib.major << '.' << unilib.minor << '.' << unilib.patch
14778	0	0	<< (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n"
		0
14779			"Copyright 2015 by Institute of Formal and Applied Linguistics, Faculty of\n"
14780	0	0	"Mathematics and Physics, Charles University in Prague, Czech Republic.";
14781
14782	0		return info.str();
14783			}
14784
14785			} // namespace morphodita
14786
14787			/////////
14788			// File: parsito/configuration/configuration.cpp
14789			/////////
14790
14791			// This file is part of Parsito .
14792			//
14793			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14794			// Mathematics and Physics, Charles University in Prague, Czech Republic.
14795			//
14796			// This Source Code Form is subject to the terms of the Mozilla Public
14797			// License, v. 2.0. If a copy of the MPL was not distributed with this
14798			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
14799
14800			namespace parsito {
14801
			// Resets the configuration to the initial parsing state for tree t.
			// t must be non-null (asserted). All nodes of t are unlinked and t is stored.
			// The stack then holds node 0 when the tree has any node, and the buffer holds
			// node indices size-1 down to 1, so buffer.back() is node 1.
14802	1		void configuration::init(tree* t) {
14803	1	50	assert(t);
14804
14805			t->unlink_all_nodes();
14806	1		this->t = t;
14807
14808			stack.clear();
14809	2	50	if (!t->nodes.empty()) stack.push_back(0);
14810
14811			buffer.clear();
14812	1		buffer.reserve(t->nodes.size());
			// Counting down to i > 1 leaves node 0 out of the buffer.
14813	8	100	for (size_t i = t->nodes.size(); i > 1; i--)
14814	14		buffer.push_back(i - 1);
14815	1		}
14816
			// Returns true when the buffer is empty and at most one node is left on the stack.
14817	0		bool configuration::final() {
14818	67	0	return buffer.empty() && stack.size() <= 1;
		0
		0
		0
		0
		0
		0
		0
		100
		100
		0
		0
		0
		0
14819			}
14820
14821			} // namespace parsito
14822
14823			/////////
14824			// File: parsito/configuration/node_extractor.h
14825			/////////
14826
14827			// This file is part of Parsito .
14828			//
14829			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14830			// Mathematics and Physics, Charles University in Prague, Czech Republic.
14831			//
14832			// This Source Code Form is subject to the terms of the Mozilla Public
14833			// License, v. 2.0. If a copy of the MPL was not distributed with this
14834			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
14835
14836			namespace parsito {
14837
			// Selects nodes of a parser configuration according to a textual description.
			// create() parses the description into selectors; extract() evaluates every
			// selector against a configuration and yields one node index per selector.
			// NOTE(review): template arguments (pair, vector) are missing in this listing.
14838	1		class node_extractor {
14839			public:
			// Number of selectors, i.e. number of entries extract() produces.
14840			unsigned node_count() const;
14841			void extract(const configuration& conf, vector& nodes) const;
14842
			// Returns false and fills error when the description cannot be parsed.
14843			bool create(string_piece description, string& error);
14844
14845			private:
14846			enum start_t { STACK = 0, BUFFER = 1 };
14847			enum direction_t { PARENT = 0, CHILD = 1 };
			// One selector: a starting location (stack or buffer, with an index counted
			// from the end of that container) followed by parent/child steps.
14848	80		struct node_selector {
14849			pair start;
14850			vector> directions;
14851
14852			node_selector(start_t start, int start_index) : start(start, start_index) {}
14853			};
14854
14855			vector selectors;
14856			};
14857
14858			} // namespace parsito
14859
14860			/////////
14861			// File: parsito/configuration/node_extractor.cpp
14862			/////////
14863
14864			// This file is part of Parsito .
14865			//
14866			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14867			// Mathematics and Physics, Charles University in Prague, Czech Republic.
14868			//
14869			// This Source Code Form is subject to the terms of the Mozilla Public
14870			// License, v. 2.0. If a copy of the MPL was not distributed with this
14871			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
14872
14873			namespace parsito {
14874
			// Returns the number of configured selectors.
14875	0		unsigned node_extractor::node_count() const {
14876	0		return selectors.size();
14877			}
14878
14879	62		void node_extractor::extract(const configuration& conf, vector& nodes) const {
14880			nodes.clear();
14881	1178	100	for (auto&& selector : selectors) {
14882			// Start by locating starting node
14883	1116		int current = -1;
14884	1116		switch (selector.start.first) {
14885			case STACK:
14886	930	100	if (selector.start.second < int(conf.stack.size()))
14887	867		current = conf.stack[conf.stack.size() - 1 - selector.start.second];
14888			break;
14889			case BUFFER:
14890	186	100	if (selector.start.second < int(conf.buffer.size()))
14891	98		current = conf.buffer[conf.buffer.size() - 1 - selector.start.second];
14892			break;
14893			}
14894
14895			// Follow directions to the final node
14896	1116	100	if (current >= 0)
14897	1212	100	for (auto&& direction : selector.directions) {
14898	802		const node& node = conf.t->nodes[current];
14899	802		switch (direction.first) {
14900			case PARENT:
14901	0	0	current = node.head ? node.head : -1;
14902	0		break;
14903			case CHILD:
14904	401	100	current = direction.second >= 0 && direction.second < int(node.children.size()) ?
14905	120		node.children[direction.second] :
14906	401	100	direction.second < 0 && -direction.second <= int(node.children.size()) ?
14907	127		node.children[node.children.size() + direction.second] :
14908	1330	100	-1;
		100
14909	802		break;
14910			}
14911	802	100	if (current <= 0) break;
14912			}
14913
14914			// Add the selected node
14915	1116		nodes.push_back(current);
14916			}
14917	62		}
14918
14919	1		bool node_extractor::create(string_piece description, string& error) {
14920	1		selectors.clear();
14921			error.clear();
14922
14923			vector lines, parts, words;
14924	1	50	split(description, '\n', lines);
14925	20	100	for (auto&& line : lines) {
14926	19	100	if (!line.len \|\| line.str[0] == '#') continue;
		50
14927
14928			// Separate start and directions
14929	18	50	split(line, ',', parts);
14930
14931			// Parse start
14932	18	50	split(parts[0], ' ', words);
14933	18	50	if (words.size() != 2)
14934	0	0	return error.assign("The node selector '").append(parts[0].str, parts[0].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false;
		0
		0
		0
		0
14935
14936			start_t start;
14937	18	100	if (words[0] == "stack")
14938	15		start = STACK;
14939	3	50	else if (words[0] == "buffer")
14940	3		start = BUFFER;
14941			else
14942	0	0	return error.assign("Cannot parse starting location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false;
		0
		0
		0
		0
14943
14944			int start_index;
14945	18	50	if (!parse_int(words[1], "starting index", start_index, error)) return false;
		50
14946
14947	18	50	selectors.emplace_back(start, start_index);
14948
14949			// Parse directions
14950	34	100	for (size_t i = 1; i < parts.size(); i++) {
14951	16	50	split(parts[i], ' ', words);
14952	16	50	if (words.empty())
14953	0	0	return error.assign("Empty node selector on line '").append(line.str, line.len).append(".!"), false;
		0
		0
14954
14955	16	50	if (words[0] == "parent") {
14956	0	0	if (words.size() != 1)
14957	0	0	return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain one space separated value!"), false;
		0
		0
		0
		0
14958	0	0	selectors.back().directions.emplace_back(PARENT, 0);
14959	16	50	} else if (words[0] == "child") {
14960	16	50	if (words.size() != 2)
14961	0	0	return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false;
		0
		0
		0
		0
14962			int child_index;
14963	16	50	if (!parse_int(words[1], "child index", child_index, error)) return false;
		50
14964	16	50	selectors.back().directions.emplace_back(CHILD, child_index);
14965			} else {
14966	0	0	return error.assign("Cannot parse direction location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false;
		0
		0
		0
		0
14967			}
14968			}
14969			}
14970
14971			return true;
14972			}
14973
14974			} // namespace parsito
14975
14976			/////////
14977			// File: parsito/configuration/value_extractor.h
14978			/////////
14979
14980			// This file is part of Parsito .
14981			//
14982			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
14983			// Mathematics and Physics, Charles University in Prague, Czech Republic.
14984			//
14985			// This Source Code Form is subject to the terms of the Mozilla Public
14986			// License, v. 2.0. If a copy of the MPL was not distributed with this
14987			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
14988
14989			namespace parsito {
14990
			// Extracts one textual value (form, lemma, tag, ...) from a node. Which value
			// is extracted is chosen by create() from a textual description.
14991			class value_extractor {
14992			public:
			// Stores the selected value of n in value.
14993			void extract(const node& n, string& value) const;
14994
			// Returns false and fills error when the description is not a known selector.
14995			bool create(string_piece description, string& error);
14996
14997			private:
14998			enum value_t { FORM = 0, LEMMA = 1, LEMMA_ID = 2, TAG = 3, UNIVERSAL_TAG = 4,
14999			FEATS = 5, UNIVERSAL_TAG_FEATS = 6, DEPREL = 7 };
			// Not initialized until create() succeeds.
15000			value_t selector;
15001			};
15002
15003			} // namespace parsito
15004
15005			/////////
15006			// File: parsito/configuration/value_extractor.cpp
15007			/////////
15008
15009			// This file is part of Parsito .
15010			//
15011			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
15012			// Mathematics and Physics, Charles University in Prague, Czech Republic.
15013			//
15014			// This Source Code Form is subject to the terms of the Mozilla Public
15015			// License, v. 2.0. If a copy of the MPL was not distributed with this
15016			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
15017
15018			namespace parsito {
15019
			// Copies the field of n chosen by selector into value:
			//   FORM -> form, LEMMA -> lemma, TAG -> xpostag, UNIVERSAL_TAG -> upostag,
			//   FEATS -> feats, UNIVERSAL_TAG_FEATS -> upostag followed by feats,
			//   DEPREL -> deprel, LEMMA_ID -> the text after "LId=" in misc (see below).
			// value is left untouched when selector holds none of the enumerators.
15020	2016		void value_extractor::extract(const node& n, string& value) const {
15021	2016		switch (selector) {
15022			case FORM:
15023	504		value.assign(n.form);
15024			break;
15025			case LEMMA:
15026	0		value.assign(n.lemma);
15027			break;
15028			case LEMMA_ID:
15029	0	0	if (!n.misc.empty()) {
15030			// Try finding LId= in misc column
15031	0		auto lid = n.misc.find("LId=");
15032	0	0	if (lid != string::npos) {
			// Skip over the four characters of "LId=".
15033	0		lid += 4;
15034
15035			// Find optional \| ending the lemma_id
15036	0		auto lid_end = n.misc.find('\|', lid);
15037	0	0	if (lid_end == string::npos) lid_end = n.misc.size();
15038
15039			// Store the lemma_id
15040	0		value.assign(n.misc, lid, lid_end - lid);
15041	0		break;
15042			}
15043			}
			// No LId= entry in misc: fall back to the plain lemma.
15044	0		value.assign(n.lemma);
15045			break;
15046			case TAG:
15047	0		value.assign(n.xpostag);
15048			break;
15049			case UNIVERSAL_TAG:
15050	504		value.assign(n.upostag);
15051			break;
15052			case FEATS:
15053	504		value.assign(n.feats);
15054			break;
15055			case UNIVERSAL_TAG_FEATS:
15056	0		value.assign(n.upostag).append(n.feats);
15057			break;
15058			case DEPREL:
15059	504		value.assign(n.deprel);
15060			break;
15061			}
15062	2016		}
15063
			// Chooses the value to extract from its textual name. Accepted names are
			// "form", "lemma", "lemma_id", "tag", "universal_tag", "feats",
			// "universal_tag_feats" and "deprel". Returns true on success; for any other
			// name returns false, sets error and leaves selector unchanged.
15064	4		bool value_extractor::create(string_piece description, string& error) {
15065			error.clear();
15066
15067	4	100	if (description == "form")
15068	1		selector = FORM;
15069	3	50	else if (description == "lemma")
15070	0		selector = LEMMA;
15071	3	50	else if (description == "lemma_id")
15072	0		selector = LEMMA_ID;
15073	3	50	else if (description == "tag")
15074	0		selector = TAG;
15075	3	100	else if (description == "universal_tag")
15076	1		selector = UNIVERSAL_TAG;
15077	2	100	else if (description == "feats")
15078	1		selector = FEATS;
15079	1	50	else if (description == "universal_tag_feats")
15080	0		selector = UNIVERSAL_TAG_FEATS;
15081	1	50	else if (description == "deprel")
15082	1		selector = DEPREL;
15083			else
15084	0		return error.assign("Cannot parse value selector '").append(description.str, description.len).append("'!"), false;
15085
15086			return true;
15087			}
15088
15089			} // namespace parsito
15090
15091			/////////
15092			// File: parsito/embedding/embedding.h
15093			/////////
15094
15095			// This file is part of Parsito .
15096			//
15097			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
15098			// Mathematics and Physics, Charles University in Prague, Czech Republic.
15099			//
15100			// This Source Code Form is subject to the terms of the Mozilla Public
15101			// License, v. 2.0. If a copy of the MPL was not distributed with this
15102			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
15103
15104			namespace parsito {
15105
			// A word embedding table: a dictionary from words to row ids and a flat array
			// of weights holding `dimension` floats per row. An optional extra row after
			// the dictionary rows represents unknown words.
			// NOTE(review): template arguments of the containers are missing in this listing.
15106	4		class embedding {
15107			public:
			// Number of floats per embedding row.
15108			unsigned dimension;
15109
			// Returns the row id for word, or unknown_word() when no variant of the word
			// is in the dictionary; buffer is scratch space.
15110			int lookup_word(const string& word, string& buffer) const;
			// Row id used for unknown words; negative when there is no such row.
15111			int unknown_word() const;
15112			float* weight(int id); // nullptr for wrong id
15113			const float* weight(int id) const; // nullptr for wrong id
15114
15115			bool can_update_weights(int id) const;
15116
15117			void load(binary_decoder& data);
15118			void save(binary_encoder& enc) const;
15119
15120			void create(unsigned dimension, int updatable_index, const vector>>& words, const vector& unknown_weights);
15121			void export_embeddings(vector>>& words, vector& unknown_weights) const;
15122			private:
			// Rows with id >= updatable_index may be updated (see can_update_weights).
15123			int updatable_index, unknown_index;
15124
15125			unordered_map dictionary;
15126			vector weights;
15127			};
15128
15129			} // namespace parsito
15130
15131			/////////
15132			// File: parsito/embedding/embedding.cpp
15133			/////////
15134
15135			// This file is part of Parsito .
15136			//
15137			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
15138			// Mathematics and Physics, Charles University in Prague, Czech Republic.
15139			//
15140			// This Source Code Form is subject to the terms of the Mozilla Public
15141			// License, v. 2.0. If a copy of the MPL was not distributed with this
15142			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
15143
15144			namespace parsito {
15145
			// Returns the dictionary id of word, trying progressively looser variants:
			//   1. the word itself;
			//   2. the word with everything but the first character lowercased, when both
			//      the first and some later character have a unicode::Lut category bit;
			//   3. the fully lowercased word, when any character has a unicode::Lut bit;
			//   4. the first character alone, when it has a unicode::N bit and no later
			//      character has a unicode::L bit.
			// Returns unknown_index when none of these is in the dictionary.
			// buffer is scratch space and is overwritten.
15146	128		int embedding::lookup_word(const string& word, string& buffer) const {
15147			using namespace unilib;
15148
15149			auto it = dictionary.find(word);
15150	128	100	if (it != dictionary.end()) return it->second;
15151
15152			// We now apply several heuristics to find a match
15153
15154			// Try locating uppercase/titlecase characters which we could lowercase
15155			bool first = true;
			// Category bits of the first character, and the union of those of all others.
15156			unicode::category_t first_category = 0, other_categories = 0;
15157	54	100	for (auto&& chr : utf8::decoder(word)) {
15158	18	100	(first ? first_category : other_categories) \|= unicode::category(chr);
15159			first = false;
15160			}
15161
15162	36	50	if ((first_category & unicode::Lut) && (other_categories & unicode::Lut)) {
		0
15163			// Lowercase all characters but the first
15164			buffer.clear();
15165			first = true;
15166	0	0	for (auto&& chr : utf8::decoder(word)) {
15167	0	0	utf8::append(buffer, first ? chr : unicode::lowercase(chr));
15168			first = false;
15169			}
15170
15171			it = dictionary.find(buffer);
15172	0	0	if (it != dictionary.end()) return it->second;
15173			}
15174
15175	36	50	if ((first_category & unicode::Lut) \|\| (other_categories & unicode::Lut)) {
		50
15176			utf8::map(unicode::lowercase, word, buffer);
15177
15178			it = dictionary.find(buffer);
15179	0	0	if (it != dictionary.end()) return it->second;
15180			}
15181
15182			// If the word starts with a digit and contains only digits and non-letter characters,
15183			// i.e. large number, date, time, try replacing it with first digit only.
15184	36	50	if ((first_category & unicode::N) && !(other_categories & unicode::L)) {
		0
15185			buffer.clear();
15186	0		utf8::append(buffer, utf8::first(word));
15187
15188			it = dictionary.find(buffer);
15189	0	0	if (it != dictionary.end()) return it->second;
15190			}
15191
15192	36		return unknown_index;
15193			}
15194
			// Returns the row id used for unknown words; load() and create() set it to -1
			// when the embedding has no such row.
15195	0		int embedding::unknown_word() const {
15196	0		return unknown_index;
15197			}
15198
15199	0		float* embedding::weight(int id) {
15200	0	0	if (id < 0 \|\| id * dimension >= weights.size()) return nullptr;
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
15201	0		return weights.data() + id * dimension;
15202			}
15203
15204	0		const float* embedding::weight(int id) const {
15205	58	0	if (id < 0 \|\| id * dimension >= weights.size()) return nullptr;
		0
		0
		50
		100
		100
		50
		50
		50
		0
		0
		0
		0
		0
		0
15206	54		return weights.data() + id * dimension;
15207			}
15208
			// Loads the embedding from binary data, in this order: 4-byte dimension,
			// 4-byte word count, that many strings, a 1-byte flag saying whether an
			// unknown-word row exists, and finally the weights.
			// Words get ids in the order they are read. updatable_index is set to the
			// maximum of its type, so can_update_weights() is false for ordinary ids.
			// NOTE(review): numeric_limits and data.next appear without template
			// arguments in this listing.
15209	4		void embedding::load(binary_decoder& data) {
15210			// Load dimension
15211	4		dimension = data.next_4B();
15212
15213	4		updatable_index = numeric_limits::max();
15214
15215			// Load dictionary
15216			dictionary.clear();
15217			string word;
15218	27	50	for (unsigned size = data.next_4B(); size; size--) {
		100
15219	23	50	data.next_str(word);
15220	46		dictionary.emplace(word, (int)dictionary.size());
15221			}
15222
			// The unknown-word row, when present, directly follows the dictionary rows.
15223	4	50	unknown_index = data.next_1B() ? dictionary.size() : -1;
		50
15224
15225			// Load weights
			// One row per dictionary word plus one more when the unknown-word row exists.
15226	4	50	weights.resize(dimension * (dictionary.size() + (unknown_index >= 0)));
15227	4	50	memcpy(weights.data(), data.next(weights.size()), sizeof(float) * weights.size());
15228	4		}
15229
15230			} // namespace parsito
15231
15232			/////////
15233			// File: parsito/embedding/embedding_encode.cpp
15234			/////////
15235
15236			// This file is part of Parsito .
15237			//
15238			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
15239			// Mathematics and Physics, Charles University in Prague, Czech Republic.
15240			//
15241			// This Source Code Form is subject to the terms of the Mozilla Public
15242			// License, v. 2.0. If a copy of the MPL was not distributed with this
15243			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
15244
15245			namespace parsito {
15246
			// Writes the embedding to enc: 4-byte dimension, 4-byte word count, the words
			// ordered by id, a 1-byte flag saying whether an unknown-word row exists, and
			// the weights. updatable_index is not written.
15247	0		void embedding::save(binary_encoder& enc) const {
15248			// Save dimension
15249	0		enc.add_4B(dimension);
15250
15251			// Save the dictionary
			// Invert the dictionary so that the words can be written in id order.
15252	0		vector words(dictionary.size());
15253	0	0	for (auto&& entry : dictionary) {
15254	0	0	assert(entry.second >= 0 && entry.second < int(dictionary.size()));
		0
15255	0		words[entry.second] = entry.first;
15256			}
15257	0		enc.add_4B(dictionary.size());
15258	0	0	for (auto&& word : words)
15259	0	0	enc.add_str(word);
15260
15261	0	0	enc.add_1B(unknown_index >= 0);
15262
15263			// Save the weights
15264			enc.add_data(weights);
15265	0		}
15266
			// Returns true when id is at or above updatable_index.
15267	0		bool embedding::can_update_weights(int id) const {
15268	0		return id >= int(updatable_index);
15269			}
15270
			// Builds the embedding from explicit data.
			//   dimension       - number of floats per row; every entry of words must have
			//                     exactly this many weights (asserted).
			//   updatable_index - stored for can_update_weights().
			//   words           - (word, weights) pairs; ids are assigned in this order.
			//   unknown_weights - when non-empty, appended as the unknown-word row right
			//                     after the word rows; when empty, unknown_index is -1.
			// NOTE(review): a repeated word would not be re-inserted by emplace but its
			// weights would still be appended, shifting the rows of later words --
			// presumably callers pass unique words; confirm.
15271	0		void embedding::create(unsigned dimension, int updatable_index, const vector>>& words, const vector& unknown_weights) {
15272	0		this->dimension = dimension;
15273	0		this->updatable_index = updatable_index;
15274
15275			dictionary.clear();
15276			weights.clear();
15277	0	0	for (auto&& word : words) {
15278	0	0	assert(word.second.size() == dimension);
15279	0		dictionary.emplace(word.first, (int)dictionary.size());
15280	0		weights.insert(weights.end(), word.second.begin(), word.second.end());
15281			}
15282
15283	0	0	if (unknown_weights.empty()) {
15284	0		this->unknown_index = -1;
15285			} else {
15286	0		this->unknown_index = dictionary.size();
15287	0		weights.insert(weights.end(), unknown_weights.begin(), unknown_weights.end());
15288			}
15289	0		}
15290
			// Copies the embedding out into explicit data.
			// words receives one (word, weights) pair per dictionary entry, placed at the
			// position given by the word's id; unknown_weights receives the unknown-word
			// row when there is one. Both outputs are cleared first, and both stay empty
			// when the dictionary is empty.
15291	0		void embedding::export_embeddings(vector>>& words, vector& unknown_weights) const {
15292			words.clear();
15293			unknown_weights.clear();
15294
15295	0	0	if (dictionary.empty()) return;
15296
15297	0	0	assert(unknown_index < 0 \|\| unknown_index == int(dictionary.size()));
		0
15298
15299	0		words.resize(dictionary.size());
15300	0	0	for (auto&& entry : dictionary) {
15301	0		words[entry.second].first = entry.first;
15302	0		words[entry.second].second.assign(weights.data() + entry.second * dimension, weights.data() + entry.second * dimension + dimension);
15303			}
15304	0	0	if (unknown_index >= 0)
15305	0		unknown_weights.assign(weights.data() + unknown_index * dimension, weights.data() + unknown_index * dimension + dimension);
15306			}
15307
15308			} // namespace parsito
15309
15310			/////////
15311			// File: parsito/network/activation_function.h
15312			/////////
15313
15314			// This file is part of Parsito .
15315			//
15316			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
15317			// Mathematics and Physics, Charles University in Prague, Czech Republic.
15318			//
15319			// This Source Code Form is subject to the terms of the Mozilla Public
15320			// License, v. 2.0. If a copy of the MPL was not distributed with this
15321			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
15322
15323			namespace parsito {
15324
			// Activation function kinds of the hidden layer, plus a parser from their names.
15325			struct activation_function {
			// neural_network::load casts a byte read from the model to this enum, so
			// renumbering the values changes how that byte is interpreted.
15326			enum type { TANH = 0, CUBIC = 1, RELU = 2 };
15327
			// Maps "tanh", "cubic" or "relu" to the matching value, stores it in
			// activation and returns true. Any other name returns false and leaves
			// activation unchanged.
15328			static bool create(string_piece name, type& activation) {
15329			if (name == "tanh") return activation = TANH, true;
15330			if (name == "cubic") return activation = CUBIC, true;
15331			if (name == "relu") return activation = RELU, true;
15332			return false;
15333			}
15334			};
15335
15336			} // namespace parsito
15337
15338			/////////
15339			// File: parsito/network/neural_network.h
15340			/////////
15341
15342			// This file is part of Parsito .
15343			//
15344			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
15345			// Mathematics and Physics, Charles University in Prague, Czech Republic.
15346			//
15347			// This Source Code Form is subject to the terms of the Mozilla Public
15348			// License, v. 2.0. If a copy of the MPL was not distributed with this
15349			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
15350
15351			namespace parsito {
15352
			// A feed-forward network with one hidden layer: weights[0] maps the
			// concatenated embeddings (plus a bias row) to the hidden layer, weights[1]
			// belongs to the outcomes layer.
			// NOTE(review): template arguments of the vectors are missing in this listing.
15353	7	0	class neural_network {
		0
		100
		50
		100
15354			public:
15355			typedef vector>> embeddings_cache;
15356
			// Computes hidden_layer and outcomes for the given embedding id sequences.
			// cache, when given, supplies precomputed hidden-layer contributions.
15357			void propagate(const vector& embeddings, const vector*>& embedding_ids_sequences,
15358			vector& hidden_layer, vector& outcomes, const embeddings_cache* cache = nullptr, bool softmax = true) const;
15359
15360			void load(binary_decoder& data);
15361			void generate_tanh_cache();
15362			void generate_embeddings_cache(const vector& embeddings, embeddings_cache& cache, unsigned max_words) const;
15363
15364			private:
15365			friend class neural_network_trainer;
15366
15367			void load_matrix(binary_decoder& data, vector>& m);
15368
15369			activation_function::type hidden_layer_activation;
15370			vector> weights[2];
15371
			// When non-empty, propagate() reads tanh values from this table instead of
			// calling tanh.
15372			vector tanh_cache;
15373			};
15374
15375			} // namespace parsito
15376
15377			/////////
15378			// File: parsito/network/neural_network.cpp
15379			/////////
15380
15381			// This file is part of Parsito .
15382			//
15383			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
15384			// Mathematics and Physics, Charles University in Prague, Czech Republic.
15385			//
15386			// This Source Code Form is subject to the terms of the Mozilla Public
15387			// License, v. 2.0. If a copy of the MPL was not distributed with this
15388			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
15389
15390			namespace parsito {
15391
			// Reads one matrix from data into m: a 4-byte row count, a 4-byte column
			// count, then the floats row by row.
			// NOTE(review): data.next appears without its template argument in this
			// listing; the memcpy size implies it yields `columns` floats -- confirm.
15392	2		void neural_network::load_matrix(binary_decoder& data, vector>& m) {
15393	2		unsigned rows = data.next_4B();
15394	2		unsigned columns = data.next_4B();
15395
15396	2		m.resize(rows);
15397	369	100	for (auto&& row : m) {
15398	367		row.resize(columns);
15399	367		memcpy(row.data(), data.next(columns), sizeof(float) * columns);
15400			}
15401	2		}
15402
			// Loads the network from data: one byte selecting the hidden layer activation
			// function, followed by the two weight matrices in order.
			// The byte is cast to the enum without a range check.
15403	1		void neural_network::load(binary_decoder& data) {
15404	1		hidden_layer_activation = activation_function::type(data.next_1B());
15405	1		load_matrix(data, weights[0]);
15406	1		load_matrix(data, weights[1]);
15407	1		}
15408
15409	62		void neural_network::propagate(const vector& embeddings, const vector*>& embedding_ids_sequences,
15410			vector& hidden_layer, vector& outcomes, const embeddings_cache* cache, bool softmax) const {
15411	62	50	assert(!weights[0].empty());
15412	62	50	assert(!weights[1].empty());
15413	1178	100	for (auto&& embedding_ids : embedding_ids_sequences) if (embedding_ids) assert(embeddings.size() == embedding_ids->size());
		100
		50
15414
15415	62		unsigned hidden_layer_size = weights[0].front().size();
15416	62		unsigned outcomes_size = weights[1].front().size();
15417
15418	124		outcomes.assign(outcomes_size, 0);
15419
15420			// Hidden layer
15421	62		hidden_layer.assign(hidden_layer_size, 0);
15422
15423			unsigned index = 0;
15424	1178	100	for (unsigned sequence = 0; sequence < embedding_ids_sequences.size(); sequence++)
15425	5580	100	for (unsigned i = 0; i < embeddings.size(); index += embeddings[i].dimension, i++)
15426	6104	100	if (embedding_ids_sequences[sequence] && embedding_ids_sequences[sequence]->at(i) >= 0) {
		50
		100
15427	1640		unsigned word = embedding_ids_sequences[sequence]->at(i);
15428	3280	50	if (cache && i < cache->size() && word < cache->at(i).size()) {
		50
		50
		50
15429			// Use cache
15430	1640		const float* precomputed = cache->at(i)[word].data() + sequence * hidden_layer_size;
15431	9840	100	for (unsigned j = 0; j < hidden_layer_size; j++)
15432	16400		hidden_layer[j] += precomputed[j];
15433			} else {
15434			// Compute directly
15435			const float* embedding = embeddings[i].weight(word);
15436	0	0	for (unsigned j = 0; j < embeddings[i].dimension; j++)
15437	0	0	for (unsigned k = 0; k < hidden_layer_size; k++)
15438	0		hidden_layer[k] += embedding[j] * weights[0][index + j][k];
15439			}
15440			}
15441	372	100	for (unsigned i = 0; i < hidden_layer_size; i++) // Bias
15442	930		hidden_layer[i] += weights[0][index][i];
15443
15444			// Activation function
15445	62		switch (hidden_layer_activation) {
15446			case activation_function::TANH:
15447	62	50	if (!tanh_cache.empty())
15448	372	100	for (auto&& weight : hidden_layer)
15449	310	50	weight = weight <= -10 ? -1 : weight >= 10 ? 1 : tanh_cache[int(weight * 32768 + 10 * 32768)];
		50
15450			else
15451	62	0	for (auto&& weight : hidden_layer)
15452	0		weight = tanh(weight);
15453			break;
15454			case activation_function::CUBIC:
15455	0	0	for (auto&& weight : hidden_layer)
15456	0		weight = weight * weight * weight;
15457			break;
15458			case activation_function::RELU:
15459	0	0	for (auto&& weight : hidden_layer)
15460	0	0	if (weight < 0) weight = 0;
15461			break;
15462			}
15463
15464	372	100	for (unsigned i = 0; i < hidden_layer_size; i++)
15465	4340	100	for (unsigned j = 0; j < outcomes_size; j++)
15466	16120		outcomes[j] += hidden_layer[i] * weights[1][i][j];
15467	868	100	for (unsigned i = 0; i < outcomes_size; i++) // Bias
15468	2418		outcomes[i] += weights[1][hidden_layer_size][i];
15469
15470			// Softmax if requested
15471	62	50	if (softmax) {
15472	62		float max = outcomes[0];
15473	806	100	for (unsigned i = 1; i < outcomes_size; i++) if (outcomes[i] > max) max = outcomes[i];
		100
15474
15475			float sum = 0;
15476	868	100	for (unsigned i = 0; i < outcomes_size; i++) sum += (outcomes[i] = exp(outcomes[i] - max));
15477	62		sum = 1 / sum;
15478
15479	868	100	for (unsigned i = 0; i < outcomes_size; i++) outcomes[i] *= sum;
15480			}
15481	62		}
15482
15483	1		void neural_network::generate_tanh_cache() {
15484	1		tanh_cache.resize(2 * 10 * 32768);
15485	655361	100	for (unsigned i = 0; i < tanh_cache.size(); i++)
15486	655360		tanh_cache[i] = tanh(i / 32768.0 - 10);
15487	1		}
15488
15489	2		void neural_network::generate_embeddings_cache(const vector& embeddings, embeddings_cache& cache, unsigned max_words) const {
15490			unsigned embeddings_dim = 0;
15491	5	100	for (auto&& embedding : embeddings) embeddings_dim += embedding.dimension;
15492
15493	1		unsigned sequences = weights[0].size() / embeddings_dim;
15494	1	50	assert(sequences * embeddings_dim + 1 == weights[0].size());
15495
15496	1		unsigned hidden_layer_size = weights[0].front().size();
15497
15498	1		cache.resize(embeddings.size());
15499	5	100	for (unsigned i = 0, weight_index = 0; i < embeddings.size(); weight_index += embeddings[i].dimension, i++) {
15500			unsigned words = 0;
15501	35	50	while (words < max_words && embeddings[i].weight(words)) words++;
		100
		100
15502
15503	4		cache[i].resize(words);
15504	31	100	for (unsigned word = 0; word < words; word++) {
15505	27		const float* embedding = embeddings[i].weight(word);
15506
15507	27		cache[i][word].assign(sequences * hidden_layer_size, 0);
15508	513	100	for (unsigned sequence = 0, index = weight_index; sequence < sequences; index += embeddings_dim, sequence++)
15509	2916	100	for (unsigned j = 0; j < embeddings[i].dimension; j++)
15510	14580	100	for (unsigned k = 0; k < hidden_layer_size; k++)
15511	36450		cache[i][word][sequence * hidden_layer_size + k] += embedding[j] * weights[0][index + j][k];
15512			}
15513			}
15514	1		}
15515
15516			} // namespace parsito
15517
15518			/////////
15519			// File: parsito/network/network_parameters.h
15520			/////////
15521
15522			// This file is part of Parsito .
15523			//
15524			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
15525			// Mathematics and Physics, Charles University in Prague, Czech Republic.
15526			//
15527			// This Source Code Form is subject to the terms of the Mozilla Public
15528			// License, v. 2.0. If a copy of the MPL was not distributed with this
15529			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
15530
15531			namespace parsito {
15532
15533			struct network_trainer {
15534			enum network_trainer_algorithm {
15535			SGD,
15536			SGD_MOMENTUM,
15537			ADAGRAD,
15538			ADADELTA,
15539			ADAM,
15540			};
15541
15542			network_trainer_algorithm algorithm;
15543			float learning_rate, learning_rate_final;
15544			float momentum, momentum2;
15545			float epsilon;
15546			};
15547
15548			struct network_parameters {
15549			unsigned iterations;
15550			int structured_interval;
15551			unsigned hidden_layer;
15552			activation_function::type hidden_layer_type;
15553			network_trainer trainer;
15554			unsigned batch_size;
15555			float initialization_range;
15556			float l1_regularization;
15557			float l2_regularization;
15558			float maxnorm_regularization;
15559			float dropout_hidden, dropout_input;
15560			bool early_stopping;
15561			};
15562
15563			} // namespace parsito
15564
15565			/////////
15566			// File: parsito/network/neural_network_trainer.h
15567			/////////
15568
15569			// This file is part of Parsito .
15570			//
15571			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
15572			// Mathematics and Physics, Charles University in Prague, Czech Republic.
15573			//
15574			// This Source Code Form is subject to the terms of the Mozilla Public
15575			// License, v. 2.0. If a copy of the MPL was not distributed with this
15576			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
15577
15578			namespace parsito {
15579
15580			class neural_network_trainer {
15581			public:
15582			neural_network_trainer(neural_network& network, unsigned input_size, unsigned output_size,
15583			const network_parameters& parameters, mt19937& generator);
15584
15585			bool next_iteration();
15586
15587	0	0	struct workspace {
		0
		0
		0
		0
		0
15588			unsigned batch = 0;
15589			vector outcomes;
15590			vector hidden_layer;
15591			vector error_outcomes;
15592			vector error_hidden;
15593
15594			// Delta accumulators
15595			vector> weights_batch[2];
15596			vector>> error_embedding;
15597			vector> error_embedding_nonempty;
15598
15599			// Trainer data
15600			struct trainer_data {
15601			float delta = 0;
15602			float gradient = 0;
15603			};
15604			vector> weights_trainer[2];
15605			vector>> embedding_trainer;
15606
15607			// Dropout vectors
15608			vector input_dropout;
15609			vector hidden_dropout;
15610			vector hidden_kept;
15611			};
15612			void propagate(const vector& embeddings, const vector*>& embedding_ids_sequences, workspace& w) const;
15613			void backpropagate(vector& embeddings, const vector*>& embedding_ids_sequences, unsigned required_outcome, workspace& w);
15614
15615			void finalize_sentence();
15616
15617			void save_network(binary_encoder& enc) const;
15618
15619			private:
15620			struct trainer_sgd {
15621			static bool need_trainer_data;
15622			static inline float delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data);
15623			};
15624			struct trainer_sgd_momentum {
15625			static bool need_trainer_data;
15626			static inline float delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data);
15627			};
15628			struct trainer_adagrad {
15629			static bool need_trainer_data;
15630			static inline float delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data);
15631			};
15632			struct trainer_adadelta {
15633			static bool need_trainer_data;
15634			static inline float delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data);
15635			};
15636			struct trainer_adam {
15637			static bool need_trainer_data;
15638			static inline float delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data);
15639			};
15640			template<class TRAINER> void backpropagate_template(vector& embeddings, const vector*>& embedding_ids_sequences, unsigned required_outcome, workspace& w);
15641
15642			void l1_regularize();
15643			void maxnorm_regularize();
15644
15645			void save_matrix(const vector<vector<float>>& m, binary_encoder& enc) const;
15646
15647			neural_network& network;
15648			mt19937& generator;
15649			unsigned iteration, iterations, steps;
15650			network_trainer trainer;
15651			unsigned batch_size;
15652			float l1_regularization, l2_regularization, maxnorm_regularization;
15653			float dropout_hidden, dropout_input;
15654			};
15655
15656			} // namespace parsito
15657
15658			/////////
15659			// File: parsito/network/neural_network_trainer.cpp
15660			/////////
15661
15662			// This file is part of Parsito .
15663			//
15664			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
15665			// Mathematics and Physics, Charles University in Prague, Czech Republic.
15666			//
15667			// This Source Code Form is subject to the terms of the Mozilla Public
15668			// License, v. 2.0. If a copy of the MPL was not distributed with this
15669			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
15670
15671			namespace parsito {
15672
15673	0		neural_network_trainer::neural_network_trainer(neural_network& network, unsigned input_size, unsigned output_size,
15674	0		const network_parameters& parameters, mt19937& generator) : network(network), generator(generator) {
15675			// Initialize hidden layer
15676	0		network.hidden_layer_activation = parameters.hidden_layer_type;
15677	0	0	if (parameters.hidden_layer) {
15678	0		float uniform_pre_hidden_range = parameters.initialization_range > 0 ? parameters.initialization_range :
15679	0	0	-parameters.initialization_range * sqrt(6.0 / float(input_size + parameters.hidden_layer));
15680	0		uniform_real_distribution uniform_pre_hidden(-uniform_pre_hidden_range, uniform_pre_hidden_range);
15681
15682	0		network.weights[0].resize(input_size + 1/*bias*/);
15683	0	0	for (auto&& row : network.weights[0]) {
15684	0		row.resize(parameters.hidden_layer);
15685	0	0	for (auto&& weight : row)
15686	0		weight = uniform_pre_hidden(generator);
15687			}
15688
15689	0		float uniform_post_hidden_range = parameters.initialization_range > 0 ? parameters.initialization_range :
15690	0	0	-parameters.initialization_range * sqrt(6.0 / float(output_size + parameters.hidden_layer));
15691	0		uniform_real_distribution uniform_post_hidden(-uniform_post_hidden_range, uniform_post_hidden_range);
15692
15693	0		network.weights[1].resize(parameters.hidden_layer + 1/*bias*/);
15694	0	0	for (auto&& row : network.weights[1]) {
15695	0		row.resize(output_size);
15696	0	0	for (auto&& weight : row)
15697	0		weight = uniform_post_hidden(generator);
15698			}
15699			}
15700
15701			// Store the network_parameters
15702	0		iteration = steps = 0;
15703	0		iterations = parameters.iterations;
15704	0		trainer = parameters.trainer;
15705	0		batch_size = parameters.batch_size;
15706	0		l1_regularization = parameters.l1_regularization;
15707	0		l2_regularization = parameters.l2_regularization;
15708	0		maxnorm_regularization = parameters.maxnorm_regularization;
15709	0		dropout_hidden = parameters.dropout_hidden;
15710	0		dropout_input = parameters.dropout_input;
15711
15712			// Maxnorm regularize the created weights
15713	0	0	if (maxnorm_regularization) maxnorm_regularize();
15714	0		}
15715
15716	0		bool neural_network_trainer::next_iteration() {
15717	0	0	if (iteration++ >= iterations) return false;
15718
15719	0	0	if (trainer.algorithm != network_trainer::ADADELTA)
15720	0	0	if (trainer.learning_rate != trainer.learning_rate_final && iteration > 1)
		0
15721			trainer.learning_rate =
15722	0		exp(((iterations - iteration) * log(trainer.learning_rate) + log(trainer.learning_rate_final)) / (iterations - iteration + 1));
15723
15724			return true;
15725			}
15726
15727	0		void neural_network_trainer::propagate(const vector& embeddings, const vector*>& embedding_ids_sequences, workspace& w) const {
15728			// Initialize dropout if requested
15729	0	0	if (dropout_input) {
15730	0		w.input_dropout.resize(network.weights[0].size());
15731	0		bernoulli_distribution dropout(dropout_input);
15732	0	0	for (auto&& flag : w.input_dropout)
15733	0		flag = dropout(generator);
15734			}
15735
15736	0	0	if (dropout_hidden) {
15737	0		w.hidden_dropout.resize(network.weights[1].size());
15738	0		bernoulli_distribution dropout(dropout_hidden);
15739	0	0	for (auto&& flag : w.hidden_dropout)
15740	0		flag = dropout(generator);
15741			}
15742			w.hidden_kept.clear();
15743	0	0	for (unsigned i = 0; i < network.weights[0].front().size(); i++)
15744	0	0	if (w.hidden_dropout.empty() || !w.hidden_dropout[i])
		0
		0
15745	0		w.hidden_kept.push_back(i);
15746
15747			// Propagate
15748			unsigned hidden_layer_size = network.weights[0].front().size();
15749	0		unsigned outcomes_size = network.weights[1].front().size();
15750
15751	0		w.outcomes.assign(outcomes_size, 0);
15752
15753			// Hidden layer
15754	0		w.hidden_layer.assign(hidden_layer_size, 0);
15755
15756			unsigned index = 0;
15757	0	0	for (auto&& embedding_ids : embedding_ids_sequences)
15758			// Note: The unnecessary brackets on the following for cycle are needed
15759			// to compile on VS 2015 Update 3, which otherwise fail to compile it.
15760	0	0	for (unsigned i = 0; i < embeddings.size(); i++) {
15761	0	0	if (embedding_ids && (*embedding_ids)[i] >= 0) {
		0
		0
15762	0		const float* embedding = embeddings[i].weight((*embedding_ids)[i]);
15763	0	0	for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, embedding++, index++)
15764	0	0	if (w.input_dropout.empty() || !w.input_dropout[index])
		0
		0
15765	0	0	for (auto&& j : w.hidden_kept)
15766	0		w.hidden_layer[j] += *embedding * network.weights[0][index][j];
15767			} else {
15768	0		index += embeddings[i].dimension;
15769			}
15770			}
15771	0	0	if (dropout_input) { // Dropout normalization
15772	0		float dropout_factor = 1. / (1. - dropout_input);
15773	0	0	for (auto&& i : w.hidden_kept)
15774	0		w.hidden_layer[i] *= dropout_factor;
15775			}
15776	0	0	for (auto&& i : w.hidden_kept) // Bias
15777	0		w.hidden_layer[i] += network.weights[0][index][i];
15778
15779			// Activation function
15780	0		switch (network.hidden_layer_activation) {
15781			case activation_function::TANH:
15782	0	0	for (auto&& weight : w.hidden_layer)
15783	0		weight = tanh(weight);
15784			break;
15785			case activation_function::CUBIC:
15786	0	0	for (auto&& weight : w.hidden_layer)
15787	0		weight = weight * weight * weight;
15788			break;
15789			case activation_function::RELU:
15790	0	0	for (auto&& weight : w.hidden_layer)
15791	0	0	if (weight < 0) weight = 0;
15792			break;
15793			}
15794	0	0	if (dropout_hidden) { // Dropout normalization
15795	0		float dropout_factor = 1. / (1. - dropout_hidden);
15796	0	0	for (auto&& i : w.hidden_kept)
15797	0		w.hidden_layer[i] *= dropout_factor;
15798			}
15799
15800	0	0	for (auto&& i : w.hidden_kept)
15801	0	0	for (unsigned j = 0; j < outcomes_size; j++)
15802	0		w.outcomes[j] += w.hidden_layer[i] * network.weights[1][i][j];
15803	0	0	for (unsigned i = 0; i < outcomes_size; i++) // Bias
15804	0		w.outcomes[i] += network.weights[1][hidden_layer_size][i];
15805
15806			// Softmax
15807	0		float max = w.outcomes[0];
15808	0	0	for (unsigned i = 1; i < outcomes_size; i++) if (w.outcomes[i] > max) max = w.outcomes[i];
		0
15809
15810			float sum = 0;
15811	0	0	for (unsigned i = 0; i < outcomes_size; i++) sum += (w.outcomes[i] = exp(w.outcomes[i] - max));
15812	0		sum = 1 / sum;
15813
15814	0	0	for (unsigned i = 0; i < outcomes_size; i++) w.outcomes[i] *= sum;
15815	0		}
15816
15817			// SGD
15818			bool neural_network_trainer::trainer_sgd::need_trainer_data = false;
15819			float neural_network_trainer::trainer_sgd::delta(float gradient, const network_trainer& trainer, workspace::trainer_data& /*data*/) {
15820	0		return trainer.learning_rate * gradient;
15821			}
15822
15823			// SGD with momentum
15824			bool neural_network_trainer::trainer_sgd_momentum::need_trainer_data = true;
15825			float neural_network_trainer::trainer_sgd_momentum::delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data) {
15826	0		data.delta = trainer.momentum * data.delta + trainer.learning_rate * gradient;
15827			return data.delta;
15828			}
15829
15830			// AdaGrad
15831			bool neural_network_trainer::trainer_adagrad::need_trainer_data = true;
15832			float neural_network_trainer::trainer_adagrad::delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data) {
15833	0		data.gradient += gradient * gradient;
15834	0		return trainer.learning_rate / sqrt(data.gradient + trainer.epsilon) * gradient;
15835			}
15836
15837			// AdaDelta
15838			bool neural_network_trainer::trainer_adadelta::need_trainer_data = true;
15839	0		float neural_network_trainer::trainer_adadelta::delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data) {
15840	0		data.gradient = trainer.momentum * data.gradient + (1 - trainer.momentum) * gradient * gradient;
15841	0		float delta = sqrt(data.delta + trainer.epsilon) / sqrt(data.gradient + trainer.epsilon) * gradient;
15842	0		data.delta = trainer.momentum * data.delta + (1 - trainer.momentum) * delta * delta;
15843	0		return delta;
15844			}
15845
15846			// Adam
15847			bool neural_network_trainer::trainer_adam::need_trainer_data = true;
15848	0		float neural_network_trainer::trainer_adam::delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data) {
15849	0		data.gradient = trainer.momentum * data.gradient + (1 - trainer.momentum) * gradient;
15850	0		data.delta = trainer.momentum2 * data.delta + (1 - trainer.momentum2) * gradient * gradient;
15851	0		return trainer.learning_rate * data.gradient / sqrt(data.delta + trainer.epsilon);
15852			}
15853
15854			// Backpropagation
15855			template<class TRAINER>
15856	0		void neural_network_trainer::backpropagate_template(vector& embeddings, const vector*>& embedding_ids_sequences, unsigned required_outcome, workspace& w) {
15857	0		size_t hidden_layer_size = network.weights[0].front().size();
15858	0		size_t outcomes_size = network.weights[1].front().size();
15859
15860			// Allocate space for delta accumulators
15861	0	0	if (network.weights[0].size() > w.weights_batch[0].size()) w.weights_batch[0].resize(network.weights[0].size());
		0
		0
		0
		0
15862	0	0	if (network.weights[1].size() > w.weights_batch[1].size()) w.weights_batch[1].resize(network.weights[1].size());
		0
		0
		0
		0
15863	0	0	if (embeddings.size() > w.error_embedding.size()) w.error_embedding.resize(embeddings.size());
		0
		0
		0
		0
15864	0	0	if (embeddings.size() > w.error_embedding_nonempty.size()) w.error_embedding_nonempty.resize(embeddings.size());
		0
		0
		0
		0
15865
15866			// Allocate space for trainer_data if required)
15867	0		workspace::trainer_data none_trainer_data;
15868	0	0	if (TRAINER::need_trainer_data) {
		0
		0
		0
		0
15869	0	0	while (network.weights[0].size() > w.weights_trainer[0].size()) w.weights_trainer[0].emplace_back(network.weights[0].front().size());
		0
		0
		0
		0
15870	0	0	while (network.weights[1].size() > w.weights_trainer[1].size()) w.weights_trainer[1].emplace_back(outcomes_size);
		0
		0
		0
		0
15871			}
15872
15873			// Compute error vector
15874	0		w.error_outcomes.resize(outcomes_size);
15875	0	0	for (unsigned i = 0; i < outcomes_size; i++)
		0
		0
		0
		0
15876	0	0	w.error_outcomes[i] = (i == required_outcome) - w.outcomes[i];
		0
		0
		0
		0
15877
15878			// Backpropagate error_outcomes to error_hidden
15879	0		w.error_hidden.assign(hidden_layer_size, 0);
15880	0	0	for (auto&& i : w.hidden_kept)
		0
		0
		0
		0
15881	0	0	for (unsigned j = 0; j < outcomes_size; j++)
		0
		0
		0
		0
15882	0		w.error_hidden[i] += network.weights[1][i][j] * w.error_outcomes[j];
15883			// Dropout normalization
15884	0	0	if (dropout_hidden) {
		0
		0
		0
		0
15885	0		float dropout_factor = 1. / (1. - dropout_hidden);
15886	0	0	for (auto&& i : w.hidden_kept)
		0
		0
		0
		0
15887	0		w.error_hidden[i] *= dropout_factor;
15888			}
15889
15890			// Perform activation function derivation
15891	0		switch (network.hidden_layer_activation) {
15892			case activation_function::TANH:
15893	0	0	for (auto&& i : w.hidden_kept)
		0
		0
		0
		0
15894	0		w.error_hidden[i] *= 1 - w.hidden_layer[i] * w.hidden_layer[i];
15895			break;
15896			case activation_function::CUBIC:
15897	0	0	for (auto&& i : w.hidden_kept) {
		0
		0
		0
		0
15898	0		float hidden_layer = cbrt(w.hidden_layer[i]);
15899	0		w.error_hidden[i] *= 3 * hidden_layer * hidden_layer;
15900			}
15901			break;
15902			case activation_function::RELU:
15903	0	0	for (auto&& i : w.hidden_kept)
		0
		0
		0
		0
15904	0	0	if (w.hidden_layer[i] <= 0)
		0
		0
		0
		0
15905	0		w.error_hidden[i] = 0;
15906			break;
15907			}
15908
15909			// Update weights[1]
15910	0	0	for (auto&& i : w.hidden_kept) {
		0
		0
		0
		0
15911	0	0	if (w.weights_batch[1][i].empty()) w.weights_batch[1][i].resize(outcomes_size);
		0
		0
		0
		0
15912	0	0	for (unsigned j = 0; j < outcomes_size; j++)
		0
		0
		0
		0
15913	0		w.weights_batch[1][i][j] += w.hidden_layer[i] * w.error_outcomes[j];
15914			}
15915			// Bias
15916	0	0	if (w.weights_batch[1][hidden_layer_size].empty()) w.weights_batch[1][hidden_layer_size].resize(outcomes_size);
		0
		0
		0
		0
15917	0	0	for (unsigned i = 0; i < outcomes_size; i++)
		0
		0
		0
		0
15918	0		w.weights_batch[1][hidden_layer_size][i] += w.error_outcomes[i];
15919
15920			// Dropout normalization
15921	0	0	if (dropout_input) {
		0
		0
		0
		0
15922	0		float dropout_factor = 1. / (1. - dropout_input);
15923	0	0	for (auto&& i : w.hidden_kept)
		0
		0
		0
		0
15924	0		w.error_hidden[i] *= dropout_factor;
15925			}
15926			// Update weights[0] and backpropagate to error_embedding
15927			unsigned index = 0;
15928	0	0	for (auto&& embedding_ids : embedding_ids_sequences)
		0
		0
		0
		0
15929			// Note: The unnecessary brackets on the following for cycle are needed
15930			// to compile on VS 2015 Update 3, which otherwise fail to compile it.
15931	0	0	for (unsigned i = 0; i < embeddings.size(); i++) {
		0
		0
		0
		0
15932	0	0	if (embedding_ids && (*embedding_ids)[i] >= 0) {
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
15933	0		int embedding_id = (*embedding_ids)[i];
15934
15935			float* error_embedding = nullptr; // Accumulate embedding error if required
15936	0	0	if (embeddings[i].can_update_weights(embedding_id)) {
		0
		0
		0
		0
15937	0	0	if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1);
		0
		0
		0
		0
		0
		0
		0
		0
		0
15938	0	0	if (w.error_embedding[i][embedding_id].empty()) {
		0
		0
		0
		0
15939	0		w.error_embedding[i][embedding_id].assign(embeddings[i].dimension, 0);
15940	0	0	w.error_embedding_nonempty[i].emplace_back(embedding_id);
		0
		0
		0
		0
15941			}
15942	0		error_embedding = w.error_embedding[i][embedding_id].data();
15943			}
15944
15945	0		const float* embedding = embeddings[i].weight(embedding_id);
15946	0	0	for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding)
		0
		0
		0
		0
		0
		0
		0
		0
		0
15947	0	0	if (w.input_dropout.empty() || !w.input_dropout[index]) {
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
15948	0	0	if (error_embedding)
		0
		0
		0
		0
15949	0	0	for (auto&& j : w.hidden_kept)
		0
		0
		0
		0
15950	0		*error_embedding += network.weights[0][index][j] * w.error_hidden[j];
15951	0	0	if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size);
		0
		0
		0
		0
		0
		0
		0
		0
		0
15952	0	0	for (auto&& j : w.hidden_kept)
		0
		0
		0
		0
15953	0		w.weights_batch[0][index][j] += *embedding * w.error_hidden[j];
15954			}
15955			} else {
15956	0		index += embeddings[i].dimension;
15957			}
15958			}
15959			// Bias
15960			{
15961	0		float negate_input_dropout = 1. - dropout_hidden;
15962	0	0	if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size);
		0
		0
		0
		0
15963	0	0	for (auto&& i : w.hidden_kept)
		0
		0
		0
		0
15964	0		w.weights_batch[0][index][i] += w.error_hidden[i] * negate_input_dropout;
15965			}
15966
15967			// End if not at the end of the batch
15968	0	0	if (++w.batch < batch_size) return;
		0
		0
		0
		0
15969	0		w.batch = 0;
15970
15971			// Update hidden weights
15972	0	0	if (!network.weights[0].empty())
		0
		0
		0
		0
15973	0	0	for (int i = 0; i < 2; i++) {
		0
		0
		0
		0
15974	0	0	for (unsigned j = 0; j < w.weights_batch[i].size(); j++)
		0
		0
		0
		0
15975	0	0	if (!w.weights_batch[i][j].empty()) {
		0
		0
		0
		0
15976	0	0	for (unsigned k = 0; k < w.weights_batch[i][j].size(); k++)
		0
		0
		0
		0
15977	0	0	network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k];
		0
		0
		0
		0
		0
		0
		0
		0
15978			w.weights_batch[i][j].clear();
15979			}
15980			}
15981
15982			// Update embedding weights using error_embedding
15983	0	0	for (unsigned i = 0; i < embeddings.size(); i++) {
		0
		0
		0
		0
15984	0	0	for (auto&& id : w.error_embedding_nonempty[i]) {
		0
		0
		0
		0
15985	0	0	if (TRAINER::need_trainer_data) {
		0
		0
		0
		0
15986	0	0	if (w.embedding_trainer.size() <= i) w.embedding_trainer.resize(i + 1);
		0
		0
		0
		0
15987	0	0	if (w.embedding_trainer[i].size() <= id) w.embedding_trainer[i].resize(id + 1);
		0
		0
		0
		0
15988	0	0	if (w.embedding_trainer[i][id].size() < embeddings[i].dimension) w.embedding_trainer[i][id].resize(embeddings[i].dimension);
		0
		0
		0
		0
15989			}
15990	0		float* embedding = embeddings[i].weight(id);
15991	0	0	for (unsigned j = 0; j < embeddings[i].dimension; j++)
		0
		0
		0
		0
15992	0	0	embedding[j] += TRAINER::delta(w.error_embedding[i][id][j], trainer, TRAINER::need_trainer_data ? w.embedding_trainer[i][id][j] : none_trainer_data) - l2_regularization * embedding[j];
		0
		0
		0
15993	0		w.error_embedding[i][id].clear();
15994			}
15995			w.error_embedding_nonempty[i].clear();
15996			}
15997
15998			// Maxnorm regularize the updated weights
15999	0	0	if (maxnorm_regularization) maxnorm_regularize();
		0
		0
		0
		0
16000			}
16001
16002	0		void neural_network_trainer::backpropagate(vector& embeddings, const vector*>& embedding_ids_sequences, unsigned required_outcome, workspace& w) {
16003	0		steps++;
16004
16005	0		switch (trainer.algorithm) {
16006			case network_trainer::SGD:
16007	0		backpropagate_template<trainer_sgd>(embeddings, embedding_ids_sequences, required_outcome, w);
16008	0		return;
16009			case network_trainer::SGD_MOMENTUM:
16010	0		backpropagate_template<trainer_sgd_momentum>(embeddings, embedding_ids_sequences, required_outcome, w);
16011	0		return;
16012			case network_trainer::ADAGRAD:
16013	0		backpropagate_template<trainer_adagrad>(embeddings, embedding_ids_sequences, required_outcome, w);
16014	0		return;
16015			case network_trainer::ADADELTA:
16016	0		backpropagate_template<trainer_adadelta>(embeddings, embedding_ids_sequences, required_outcome, w);
16017	0		return;
16018			case network_trainer::ADAM:
16019	0		float original_learning_rate = trainer.learning_rate;
16020	0		trainer.learning_rate *= sqrt(1-pow(trainer.momentum2, steps)) / (1-pow(trainer.momentum, steps));
16021	0		backpropagate_template<trainer_adam>(embeddings, embedding_ids_sequences, required_outcome, w);
16022	0		trainer.learning_rate = original_learning_rate;
16023	0		return;
16024			}
16025
16026	0	0	training_failure("Internal error, unsupported trainer!");
		0
16027			}
16028
16029	0		void neural_network_trainer::l1_regularize() {
16030	0	0	if (!l1_regularization) return;
16031
16032	0	0	for (auto&& weights : network.weights)
16033	0	0	for (unsigned i = 0; i + 1 /*ignore biases*/ < weights.size(); i++) {
16034	0		auto& row = weights[i];
16035	0	0	for (auto&& weight : row)
16036	0	0	if (weight < l1_regularization) weight += l1_regularization;
16037	0	0	else if (weight > l1_regularization) weight -= l1_regularization;
16038	0		else weight = 0;
16039			}
16040			}
16041
16042	0		void neural_network_trainer::maxnorm_regularize() {
16043	0	0	if (!maxnorm_regularization) return;
16044
16045	0	0	for (unsigned i = 0; i < 2; i++)
16046	0	0	for (unsigned j = 0; j < network.weights[i].front().size(); j++) {
16047			float length = 0;
16048	0	0	for (auto&& row : network.weights[i])
16049	0		length += row[j] * row[j];
16050
16051	0	0	if (length > 0 && length > maxnorm_regularization * maxnorm_regularization) {
		0
16052	0		float factor = 1 / sqrt(length / (maxnorm_regularization * maxnorm_regularization));
16053	0	0	for (auto&& row : network.weights[i])
16054	0		row[j] *= factor;
16055			}
16056			}
16057			}
16058
16059	0		void neural_network_trainer::finalize_sentence() {
16060	0	0	if (l1_regularization) l1_regularize();
		0
		0
16061	0		}
16062
16063	0		void neural_network_trainer::save_matrix(const vector<vector<float>>& m, binary_encoder& enc) const {
16064	0		enc.add_4B(m.size());
16065	0	0	enc.add_4B(m.empty() ? 0 : m.front().size());
16066
16067	0	0	for (auto&& row : m) {
16068	0	0	assert(row.size() == m.front().size());
16069			enc.add_data(row);
16070			}
16071	0		}
16072
16073	0		void neural_network_trainer::save_network(binary_encoder& enc) const {
16074	0		enc.add_1B(network.hidden_layer_activation);
16075	0		save_matrix(network.weights[0], enc);
16076	0		save_matrix(network.weights[1], enc);
16077	0		}
16078
16079			} // namespace parsito
16080
16081			/////////
16082			// File: parsito/transition/transition.h
16083			/////////
16084
16085			// This file is part of Parsito .
16086			//
16087			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
16088			// Mathematics and Physics, Charles University in Prague, Czech Republic.
16089			//
16090			// This Source Code Form is subject to the terms of the Mozilla Public
16091			// License, v. 2.0. If a copy of the MPL was not distributed with this
16092			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
16093
16094			namespace parsito {
16095
16096			// Abstract transition class
16097	13		class transition {
16098			public:
16099	13		virtual ~transition() {}
16100
16101			virtual bool applicable(const configuration& conf) const = 0;
16102			virtual int perform(configuration& conf) const = 0;
16103			};
16104
16105			// Specific transition classes
16106	12		class transition_left_arc : public transition {
16107			public:
16108	6		transition_left_arc(const string& label) : label(label), label_is_root(label == "root") {}
16109
16110			virtual bool applicable(const configuration& conf) const override;
16111			virtual int perform(configuration& conf) const override;
16112			private:
16113			string label;
16114			bool label_is_root;
16115			};
16116
16117	12		class transition_right_arc : public transition {
16118			public:
16119	6		transition_right_arc(const string& label) : label(label), label_is_root(label == "root") {}
16120
16121			virtual bool applicable(const configuration& conf) const override;
16122			virtual int perform(configuration& conf) const override;
16123			private:
16124			string label;
16125			bool label_is_root;
16126			};
16127
16128	2		class transition_shift : public transition {
16129			public:
16130			virtual bool applicable(const configuration& conf) const override;
16131			virtual int perform(configuration& conf) const override;
16132			};
16133
16134	0		class transition_swap : public transition {
16135			public:
16136			virtual bool applicable(const configuration& conf) const override;
16137			virtual int perform(configuration& conf) const override;
16138			};
16139
16140	0		class transition_left_arc_2 : public transition {
16141			public:
16142	0		transition_left_arc_2(const string& label) : label(label), label_is_root(label == "root") {}
16143
16144			virtual bool applicable(const configuration& conf) const override;
16145			virtual int perform(configuration& conf) const override;
16146			private:
16147			string label;
16148			bool label_is_root;
16149			};
16150
16151	0		class transition_right_arc_2 : public transition {
16152			public:
16153	0		transition_right_arc_2(const string& label) : label(label), label_is_root(label == "root") {}
16154
16155			virtual bool applicable(const configuration& conf) const override;
16156			virtual int perform(configuration& conf) const override;
16157			private:
16158			string label;
16159			bool label_is_root;
16160			};
16161
16162			} // namespace parsito
16163
16164			/////////
16165			// File: parsito/transition/transition_oracle.h
16166			/////////
16167
// This file is part of Parsito <http://github.com/ufal/parsito/>.
16169			//
16170			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
16171			// Mathematics and Physics, Charles University in Prague, Czech Republic.
16172			//
16173			// This Source Code Form is subject to the terms of the Mozilla Public
16174			// License, v. 2.0. If a copy of the MPL was not distributed with this
16175			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
16176
16177			namespace parsito {
16178
16179	0		class transition_oracle {
16180			public:
16181	0		virtual ~transition_oracle() {}
16182
16183			struct predicted_transition {
16184			unsigned best;
16185			unsigned to_follow;
16186
16187			predicted_transition(unsigned best, unsigned to_follow) : best(best), to_follow(to_follow) {}
16188			};
16189
16190	0		class tree_oracle {
16191			public:
16192	0		virtual ~tree_oracle() {}
16193
16194			virtual predicted_transition predict(const configuration& conf, unsigned network_outcome, unsigned iteration) const = 0;
16195			virtual void interesting_transitions(const configuration& conf, vector& transitions) const = 0;
16196			};
16197
16198			virtual unique_ptr create_tree_oracle(const tree& gold) const = 0;
16199			};
16200
16201			} // namespace parsito
16202
16203			/////////
16204			// File: parsito/transition/transition_system.h
16205			/////////
16206
// This file is part of Parsito <http://github.com/ufal/parsito/>.
16208			//
16209			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
16210			// Mathematics and Physics, Charles University in Prague, Czech Republic.
16211			//
16212			// This Source Code Form is subject to the terms of the Mozilla Public
16213			// License, v. 2.0. If a copy of the MPL was not distributed with this
16214			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
16215
16216			namespace parsito {
16217
16218			class transition_system {
16219			public:
16220	1		virtual ~transition_system() {}
16221
16222			virtual unsigned transition_count() const;
16223			virtual bool applicable(const configuration& conf, unsigned transition) const;
16224			virtual int perform(configuration& conf, unsigned transition) const;
16225			virtual transition_oracle* oracle(const string& name) const = 0;
16226
16227			static transition_system* create(const string& name, const vector& labels);
16228
16229			protected:
16230	1		transition_system(const vector& labels) : labels(labels) {}
16231
16232			const vector& labels;
16233			vector> transitions;
16234			};
16235
16236			} // namespace parsito
16237
16238			/////////
16239			// File: parsito/parser/parser_nn.h
16240			/////////
16241
// This file is part of Parsito <http://github.com/ufal/parsito/>.
16243			//
16244			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
16245			// Mathematics and Physics, Charles University in Prague, Czech Republic.
16246			//
16247			// This Source Code Form is subject to the terms of the Mozilla Public
16248			// License, v. 2.0. If a copy of the MPL was not distributed with this
16249			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
16250
16251			namespace parsito {
16252
16253	5		class parser_nn : public parser {
16254			public:
16255			parser_nn(bool versioned);
16256
16257			virtual void parse(tree& t, unsigned beam_size = 0, double* cost = nullptr) const override;
16258
16259			protected:
16260			virtual void load(binary_decoder& data, unsigned cache) override;
16261
16262			private:
16263			friend class parser_nn_trainer;
16264			void parse_greedy(tree& t, double* cost) const;
16265			void parse_beam_search(tree& t, unsigned beam_size, double* cost) const;
16266
16267			bool versioned;
16268			unsigned version;
16269			bool single_root;
16270			enum { VERSION_LATEST = 2 };
16271
16272			vector labels;
16273			unique_ptr system;
16274
16275			node_extractor nodes;
16276
16277			vector values;
16278			vector embeddings;
16279
16280			neural_network network;
16281			neural_network::embeddings_cache embeddings_cache;
16282
16283	6	50	struct workspace {
		100
16284	4	100	workspace(bool single_root) : conf(single_root) {}
16285
16286			configuration conf;
16287
16288			string word, word_buffer;
16289			vector> embeddings;
16290			vector> embeddings_values;
16291
16292			vector extracted_nodes;
16293			vector*> extracted_embeddings;
16294
16295			vector outcomes, network_buffer;
16296
16297			// Beam-size structures
16298	228		struct beam_size_configuration {
16299			beam_size_configuration(bool single_root) : conf(single_root) {}
16300
16301			configuration conf;
16302			vector heads;
16303			vector deprels;
16304			double cost;
16305
16306			void refresh_tree();
16307			void save_tree();
16308			};
16309			struct beam_size_alternative {
16310			const beam_size_configuration* bs_conf;
16311			int transition;
16312			double cost;
16313			bool operator<(const beam_size_alternative& other) const { return cost > other.cost; }
16314
16315			beam_size_alternative(const beam_size_configuration* bs_conf, int transition, double cost)
16316	241		: bs_conf(bs_conf), transition(transition), cost(cost) {}
16317			};
16318			vector bs_confs[2]; size_t bs_confs_size[2];
16319			vector bs_alternatives;
16320			};
16321			mutable threadsafe_stack workspaces;
16322			};
16323
16324			} // namespace parsito
16325
16326			/////////
16327			// File: parsito/parser/parser.cpp
16328			/////////
16329
// This file is part of Parsito <http://github.com/ufal/parsito/>.
16331			//
16332			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
16333			// Mathematics and Physics, Charles University in Prague, Czech Republic.
16334			//
16335			// This Source Code Form is subject to the terms of the Mozilla Public
16336			// License, v. 2.0. If a copy of the MPL was not distributed with this
16337			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
16338
16339			namespace parsito {
16340
16341	0		parser* parser::load(const char* file, unsigned cache) {
16342	0	0	ifstream in(path_from_utf8(file).c_str(), ifstream::in \| ifstream::binary);
16343	0	0	if (!in.is_open()) return nullptr;
16344	0	0	return load(in, cache);
16345			}
16346
16347	1		parser* parser::load(istream& in, unsigned cache) {
16348			unique_ptr result;
16349
16350			binary_decoder data;
16351	1	50	if (!compressor::load(in, data)) return nullptr;
		50
16352
16353			try {
16354			string name;
16355	1	50	data.next_str(name);
16356
16357	1	50	result.reset(create(name));
16358	1	50	if (!result) return nullptr;
16359
16360	1	50	result->load(data, cache);
		0
16361			} catch (binary_decoder_error&) {
16362			return nullptr;
16363			}
16364
16365	1	50	return result && data.is_end() ? result.release() : nullptr;
		50
16366			}
16367
16368	1		parser* parser::create(const string& name) {
16369	1	50	if (name == "nn") return new parser_nn(false);
16370	0	0	if (name == "nn_versioned") return new parser_nn(true);
16371			return nullptr;
16372			}
16373
16374			} // namespace parsito
16375
16376			/////////
16377			// File: parsito/parser/parser_nn.cpp
16378			/////////
16379
// This file is part of Parsito <http://github.com/ufal/parsito/>.
16381			//
16382			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
16383			// Mathematics and Physics, Charles University in Prague, Czech Republic.
16384			//
16385			// This Source Code Form is subject to the terms of the Mozilla Public
16386			// License, v. 2.0. If a copy of the MPL was not distributed with this
16387			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
16388
16389			namespace parsito {
16390
16391			// Versions:
16392			// 1: initial version
16393			// 2: add ReLU activation function
16394
16395	1		parser_nn::parser_nn(bool versioned) : versioned(versioned) {}
16396
16397	1		void parser_nn::parse(tree& t, unsigned beam_size, double* cost) const {
16398	1	50	if (beam_size > 1)
16399	1		parse_beam_search(t, beam_size, cost);
16400			else
16401	0	0	parse_greedy(t, cost);
16402	1		}
16403
16404	0		void parser_nn::parse_greedy(tree& t, double* cost) const {
16405	0	0	assert(system);
16406	0	0	if (cost) *cost = 0.;
16407
16408			// Retrieve or create workspace
16409	0		workspace* w = workspaces.pop();
16410	0	0	if (!w) w = new workspace(single_root);
16411
16412			// Create configuration
16413	0		w->conf.init(&t);
16414
16415			// Compute embeddings of all nodes
16416	0	0	if (w->embeddings.size() < t.nodes.size()) w->embeddings.resize(t.nodes.size());
16417	0	0	for (size_t i = 0; i < t.nodes.size(); i++) {
16418	0	0	if (w->embeddings[i].size() < embeddings.size()) w->embeddings[i].resize(embeddings.size());
16419	0	0	for (size_t j = 0; j < embeddings.size(); j++) {
16420	0		values[j].extract(t.nodes[i], w->word);
16421	0		w->embeddings[i][j] = embeddings[j].lookup_word(w->word, w->word_buffer);
16422			}
16423			}
16424
16425			// Compute which transitions to perform and perform them
16426			int transitions = 0;
16427	0	0	for (; !w->conf.final(); transitions++) {
16428			// Extract nodes from the configuration
16429	0		nodes.extract(w->conf, w->extracted_nodes);
16430	0		w->extracted_embeddings.resize(w->extracted_nodes.size());
16431	0	0	for (size_t i = 0; i < w->extracted_nodes.size(); i++)
16432	0	0	w->extracted_embeddings[i] = w->extracted_nodes[i] >= 0 ? &w->embeddings[w->extracted_nodes[i]] : nullptr;
16433
16434			// Classify using neural network
16435	0		network.propagate(embeddings, w->extracted_embeddings, w->network_buffer, w->outcomes, &embeddings_cache, cost ? true : false);
16436
16437			// Find most probable applicable transition
16438			int best = -1;
16439	0	0	for (unsigned i = 0; i < w->outcomes.size(); i++)
16440	0	0	if (system->applicable(w->conf, i) && (best < 0 \|\| w->outcomes[i] > w->outcomes[best]))
		0
		0
		0
16441	0		best = i;
16442
16443			// Perform the best transition
16444	0		int child = system->perform(w->conf, best);
16445	0	0	if (cost) *cost += log(w->outcomes[best]);
16446
16447			// If a node was linked, recompute its embeddings as deprel has changed
16448	0	0	if (child >= 0)
16449	0	0	for (size_t i = 0; i < embeddings.size(); i++) {
16450	0		values[i].extract(t.nodes[child], w->word);
16451	0		w->embeddings[child][i] = embeddings[i].lookup_word(w->word, w->word_buffer);
16452			}
16453			}
16454
16455	0	0	if (cost && transitions)
16456	0		cost = cost / transitions * (t.nodes.size() - 1);
16457
16458			// Store workspace
16459	0		workspaces.push(w);
16460	0		}
16461
16462	1		void parser_nn::parse_beam_search(tree& t, unsigned beam_size, double* cost) const {
16463	1	50	assert(system);
16464
16465			// Retrieve or create workspace
16466	1		workspace* w = workspaces.pop();
16467	1	50	if (!w) w = new workspace(single_root);
16468
16469			// Allocate and initialize configuration
16470	3	100	for (int i = 0; i < 2; i++) {
16471	12	100	while (w->bs_confs[i].size() < beam_size) w->bs_confs[i].emplace_back(single_root);
16472	2	50	while (w->bs_confs[i].size() > beam_size) w->bs_confs[i].pop_back();
16473	2		w->bs_confs_size[i] = 0;
16474			}
16475	1		w->bs_confs[0][0].cost = 0;
16476	1		w->bs_confs[0][0].conf.init(&t);
16477	1		w->bs_confs[0][0].save_tree();
16478	1		w->bs_confs_size[0] = 1;
16479
16480			// Compute embeddings of all nodes
16481	1	50	if (w->embeddings.size() < t.nodes.size()) w->embeddings.resize(t.nodes.size());
16482	1	50	if (w->embeddings_values.size() < t.nodes.size()) w->embeddings_values.resize(t.nodes.size());
16483	9	100	for (size_t i = 0; i < t.nodes.size(); i++) {
16484	8	50	if (w->embeddings[i].size() < embeddings.size()) w->embeddings[i].resize(embeddings.size());
16485	8	50	if (w->embeddings_values[i].size() < embeddings.size()) w->embeddings_values[i].resize(embeddings.size());
16486	40	100	for (size_t j = 0; j < embeddings.size(); j++) {
16487	32		values[j].extract(t.nodes[i], w->embeddings_values[i][j]);
16488	32		w->embeddings[i][j] = embeddings[j].lookup_word(w->embeddings_values[i][j], w->word_buffer);
16489			}
16490			}
16491
16492			// Compute which transitions to perform and perform them
16493			size_t iteration = 0;
16494	16	100	for (bool all_final = false; !all_final; iteration++) {
16495			all_final = true;
16496			w->bs_alternatives.clear();
16497
16498	82	100	for (size_t c = 0; c < w->bs_confs_size[iteration & 1]; c++) {
16499	67		auto& bs_conf = w->bs_confs[iteration & 1][c];
16500
16501	67	100	if (bs_conf.conf.final()) {
16502	5	50	if (w->bs_alternatives.size() == beam_size) {
16503	0	0	if (bs_conf.cost <= w->bs_alternatives[0].cost) continue;
16504			pop_heap(w->bs_alternatives.begin(), w->bs_alternatives.end());
16505			w->bs_alternatives.pop_back();
16506			}
16507	5		w->bs_alternatives.emplace_back(&bs_conf, -1, bs_conf.cost);
16508	5		push_heap(w->bs_alternatives.begin(), w->bs_alternatives.end());
16509	5		continue;
16510			}
16511			all_final = false;
16512
16513	62		bs_conf.refresh_tree();
16514			// Update embeddings for all nodes
16515	558	100	for (size_t i = 0; i < t.nodes.size(); i++)
16516	2480	100	for (size_t j = 0; j < embeddings.size(); j++) {
16517	1984		values[j].extract(t.nodes[i], w->word);
16518	1984	100	if (w->word != w->embeddings_values[i][j]) {
16519	96		w->embeddings[i][j] = embeddings[j].lookup_word(w->word, w->word_buffer);
16520			w->embeddings_values[i][j].assign(w->word);
16521			}
16522			}
16523
16524			// Extract nodes from the configuration
16525	62		nodes.extract(bs_conf.conf, w->extracted_nodes);
16526	62		w->extracted_embeddings.resize(w->extracted_nodes.size());
16527	1178	100	for (size_t i = 0; i < w->extracted_nodes.size(); i++)
16528	1116	100	w->extracted_embeddings[i] = w->extracted_nodes[i] >= 0 ? &w->embeddings[w->extracted_nodes[i]] : nullptr;
16529
16530			// Classify using neural network
16531	62		network.propagate(embeddings, w->extracted_embeddings, w->network_buffer, w->outcomes, &embeddings_cache);
16532
16533			// Store all alternatives
16534	868	100	for (unsigned i = 0; i < w->outcomes.size(); i++)
16535	806	100	if (system->applicable(bs_conf.conf, i)) {
16536	1899		double cost = (bs_conf.cost * iteration + log(w->outcomes[i])) / (iteration + 1);
16537	633	100	if (w->bs_alternatives.size() == beam_size) {
16538	567	100	if (cost <= w->bs_alternatives[0].cost) continue;
16539			pop_heap(w->bs_alternatives.begin(), w->bs_alternatives.end());
16540			w->bs_alternatives.pop_back();
16541			}
16542	236		w->bs_alternatives.emplace_back(&bs_conf, i, cost);
16543	236		push_heap(w->bs_alternatives.begin(), w->bs_alternatives.end());
16544			}
16545			}
16546
16547	15		w->bs_confs_size[(iteration + 1) & 1] = 0;
16548	86	100	for (auto&& alternative : w->bs_alternatives) {
16549	71		auto& bs_conf_new = w->bs_confs[(iteration + 1) & 1][w->bs_confs_size[(iteration + 1) & 1]++];
16550	71		bs_conf_new = *alternative.bs_conf;
16551	71		bs_conf_new.cost = alternative.cost;
16552	71	100	if (alternative.transition >= 0) {
16553	66		bs_conf_new.refresh_tree();
16554	66		system->perform(bs_conf_new.conf, alternative.transition);
16555	66		bs_conf_new.save_tree();
16556			}
16557			}
16558			}
16559
16560			// Return the best tree
16561			size_t best = 0;
16562	5	100	for (size_t i = 1; i < w->bs_confs_size[iteration & 1]; i++)
16563	4	100	if (w->bs_confs[iteration & 1][i].cost > w->bs_confs[iteration & 1][best].cost)
16564			best = i;
16565	1		w->bs_confs[iteration & 1][best].refresh_tree();
16566
16567	1	50	if (cost) cost = w->bs_confs[iteration & 1][best].cost (t.nodes.size() - 1);
16568
16569			// Store workspace
16570	1		workspaces.push(w);
16571	1		}
16572
16573	129		void parser_nn::workspace::beam_size_configuration::refresh_tree() {
16574	1161	100	for (auto&& node : conf.t->nodes) node.children.clear();
16575	1161	100	for (size_t i = 0; i < conf.t->nodes.size(); i++) {
16576	1032		conf.t->nodes[i].head = heads[i];
16577	2064		conf.t->nodes[i].deprel = deprels[i];
16578	1334	100	if (heads[i] >= 0) conf.t->nodes[heads[i]].children.push_back(i);
16579			}
16580	129		}
16581
16582	67		void parser_nn::workspace::beam_size_configuration::save_tree() {
16583	67	100	if (conf.t->nodes.size() > heads.size()) heads.resize(conf.t->nodes.size());
16584	67	100	if (conf.t->nodes.size() > deprels.size()) deprels.resize(conf.t->nodes.size());
16585	603	100	for (size_t i = 0; i < conf.t->nodes.size(); i++) {
16586	536		heads[i] = conf.t->nodes[i].head;
16587	1072		deprels[i] = conf.t->nodes[i].deprel;
16588			}
16589	67		}
16590
16591	1		void parser_nn::load(binary_decoder& data, unsigned cache) {
16592			string description, error;
16593
16594	1	50	version = versioned ? data.next_1B() : 1;
		0
16595	1	50	if (!(version >= 1 && version <= VERSION_LATEST))
16596	0		throw binary_decoder_error("Unrecognized version of the parser_nn model");
16597
16598	1	50	single_root = version >= 2 ? data.next_1B() : false;
		0
		0
16599
16600			// Load labels
16601	1	50	labels.resize(data.next_2B());
		50
16602	7	100	for (auto&& label : labels)
16603	6	50	data.next_str(label);
16604
16605			// Load transition system
16606			string system_name;
16607	1	50	data.next_str(system_name);
16608	1	50	system.reset(transition_system::create(system_name, labels));
16609	1	50	if (!system) throw binary_decoder_error("Cannot load transition system");
16610
16611			// Load node extractor
16612	1	50	data.next_str(description);
16613	1	50	if (!nodes.create(description, error))
		50
16614	0		throw binary_decoder_error(error.c_str());
16615
16616			// Load value extractors and embeddings
16617	1	50	values.resize(data.next_2B());
		50
16618	5	100	for (auto&& value : values) {
16619	4	50	data.next_str(description);
16620	4	50	if (!value.create(description, error))
		50
16621	0		throw binary_decoder_error(error.c_str());
16622			}
16623
16624	1	50	embeddings.resize(values.size());
16625	5	100	for (auto&& embedding : embeddings)
16626	4	50	embedding.load(data);
16627
16628			// Load the network
16629	1	50	network.load(data);
16630	1	50	network.generate_tanh_cache();
16631	1	50	network.generate_embeddings_cache(embeddings, embeddings_cache, cache);
16632	1		}
16633
16634			} // namespace parsito
16635
16636			/////////
16637			// File: parsito/parser/parser_nn_trainer.h
16638			/////////
16639
// This file is part of Parsito <http://github.com/ufal/parsito/>.
16641			//
16642			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
16643			// Mathematics and Physics, Charles University in Prague, Czech Republic.
16644			//
16645			// This Source Code Form is subject to the terms of the Mozilla Public
16646			// License, v. 2.0. If a copy of the MPL was not distributed with this
16647			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
16648
16649			namespace parsito {
16650
16651			class parser_nn_trainer {
16652			public:
16653			static void train(const string& transition_system_name, const string& transition_oracle_name, bool single_root,
16654			const string& embeddings_description, const string& nodes_description, const network_parameters& parameters,
16655			unsigned number_of_threads, const vector& train, const vector& heldout, binary_encoder& enc);
16656			};
16657
16658			} // namespace parsito
16659
16660			/////////
16661			// File: parsito/parser/parser_nn_trainer.cpp
16662			/////////
16663
// This file is part of Parsito <http://github.com/ufal/parsito/>.
16665			//
16666			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
16667			// Mathematics and Physics, Charles University in Prague, Czech Republic.
16668			//
16669			// This Source Code Form is subject to the terms of the Mozilla Public
16670			// License, v. 2.0. If a copy of the MPL was not distributed with this
16671			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
16672
16673			namespace parsito {
16674
16675	0		void parser_nn_trainer::train(const string& transition_system_name, const string& transition_oracle_name, bool single_root,
16676			const string& embeddings_description, const string& nodes_description, const network_parameters& parameters,
16677			unsigned /number_of_threads/, const vector& train, const vector& heldout, binary_encoder& enc) {
16678	0	0	if (train.empty()) training_failure("No training data was given!");
		0
		0
16679
16680			// Random generator with fixed seed for reproducibility
16681			mt19937 generator(42);
16682
16683			// Check that all non-root nodes have heads and nonempty deprel
16684	0	0	for (auto&& tree : train)
16685	0	0	for (auto&& node : tree.nodes)
16686	0	0	if (node.id) {
16687	0	0	if (node.head < 0) training_failure("The node '" << node.form << "' with id " << node.id << " has no head set!");
		0
		0
		0
16688	0	0	if (node.deprel.empty()) training_failure("The node '" << node.form << "' with id " << node.id << " has no deprel set!");
		0
		0
		0
16689			}
16690
16691			// Create parser instance to be trained
16692	0		parser_nn parser(true); parser.version = parser_nn::VERSION_LATEST;
16693
16694			// Generate labels for transition system
16695			unordered_set labels_set;
16696	0	0	for (auto&& tree : train)
16697	0	0	for (auto&& node : tree.nodes)
16698	0	0	if (node.id && !labels_set.count(node.deprel)) {
16699	0		labels_set.insert(node.deprel);
16700	0	0	parser.labels.push_back(node.deprel);
16701			}
16702
16703			// If single_root, check that exactly root nodes have "root" deprel
16704	0	0	if (single_root) {
16705	0	0	for (auto&& tree : train) {
16706			unsigned roots = 0;
16707	0	0	for (auto&& node : tree.nodes)
16708	0	0	if (node.id) {
16709	0	0	if (node.head == 0 && node.deprel != "root")
		0
		0
16710	0	0	training_failure("When single root is required, every root node must have 'root' deprel!");
		0
16711	0	0	if (node.head != 0 && node.deprel == "root")
		0
		0
16712	0	0	training_failure("When single root is required, any non-root cannot have 'root' deprel!");
		0
16713	0		roots += node.head == 0;
16714			}
16715	0	0	if (roots != 1)
16716	0	0	training_failure("When single root is required, every training tree must have single root!");
		0
16717			}
16718
16719			// Make sure (in case input is really small) there is "root" deprel plus another one
16720	0	0	if (!labels_set.count("root"))
		0
16721	0	0	training_failure("When single root is required, the deprel 'root' must be present!");
		0
16722	0	0	if (labels_set.size() <= 1)
16723	0	0	training_failure("When single root is required, deprel different from 'root' must exist!");
		0
16724			}
16725
16726			// Create transition system and transition oracle
16727	0	0	parser.system.reset(transition_system::create(transition_system_name, parser.labels));
16728	0	0	if (!parser.system) training_failure("Cannot create transition system '" << transition_system_name << "'!");
		0
		0
16729
16730	0	0	unique_ptr oracle(parser.system->oracle(transition_oracle_name));
16731	0	0	if (!oracle) training_failure("Cannot create transition oracle '" << transition_oracle_name << "' for transition system '" << transition_system_name << "'!");
		0
		0
16732
16733			// Create node_extractor
16734			string error;
16735	0	0	if (!parser.nodes.create(nodes_description, error)) training_failure(error);
		0
		0
16736
16737			// Load value_extractors and embeddings
16738	0		vector value_names;
16739			vector lines, tokens;
16740	0	0	split(embeddings_description, '\n', lines);
16741	0	0	for (auto&& line : lines) {
16742			// Ignore empty lines and comments
16743	0	0	if (!line.len \|\| line.str[0] == '#') continue;
		0
16744
16745	0	0	split(line, ' ', tokens);
16746	0	0	if (!(tokens.size() >= 3 && tokens.size() <= 6))
		0
		0
16747	0	0	training_failure("Expected 3 to 6 columns on embedding description line '" << line << "'!");
		0
16748
16749	0	0	value_names.emplace_back(string(tokens[0].str, tokens[0].len));
16750	0	0	parser.values.emplace_back();
16751	0	0	if (!parser.values.back().create(tokens[0], error)) training_failure(error);
		0
		0
16752
16753	0	0	int dimension = parse_int(tokens[1], "embedding dimension");
16754	0	0	int min_count = parse_int(tokens[2], "minimum frequency count");
16755			unsigned updatable_index = 0;
16756			unsigned embeddings_from_file = 0;
16757			string embeddings_from_file_comment;
16758	0		vector>> weights;
16759			unordered_set weights_set;
16760
16761			// Compute words and counts present in the training data
16762			string word;
16763			unordered_map word_counts;
16764	0	0	for (auto&& tree : train)
16765	0	0	for (auto&& node : tree.nodes)
16766	0	0	if (node.id) {
16767	0	0	parser.values.back().extract(node, word);
16768	0		word_counts[word]++;
16769			}
16770
16771			// Load embedding if it was given
16772	0	0	if (tokens.size() >= 4) {
16773	0	0	int update_weights = tokens.size() >= 5 ? parse_int(tokens[4], "update weights") : 1;
		0
16774	0	0	int max_embeddings = tokens.size() >= 6 ? parse_int(tokens[5], "maximum embeddings count") : numeric_limits::max();
		0
16775	0	0	ifstream in(path_from_utf8(string(tokens[3].str, tokens[3].len)).c_str());
16776	0	0	if (!in.is_open()) training_failure("Cannot load '" << tokens[0] << "' embedding from file '" << tokens[3] << "'!");
		0
		0
16777
16778			// Load first line containing dictionary size and dimensions
16779			string line;
16780			vector parts;
16781	0	0	if (!getline(in, line)) training_failure("Cannot read first line from embedding file '" << tokens[3] << "'!");
		0
		0
		0
16782	0	0	split(line, ' ', parts);
16783	0	0	if (parts.size() != 2) training_failure("Expected two numbers on the first line of embedding file '" << tokens[3] << "'!");
		0
		0
16784	0	0	int file_dimension = parse_int(parts[1], "embedding file dimension");
16785
16786	0	0	if (file_dimension < dimension) training_failure("The embedding file '" << tokens[3] << "' has lower dimension than required!");
		0
		0
16787
16788			// Generate random projection when smaller dimension is required
16789	0		vector> projection;
16790	0	0	if (file_dimension > dimension) {
16791	0	0	embeddings_from_file_comment = "[dim" + to_string(file_dimension) + "->" + to_string(dimension) + "]";
		0
		0
		0
16792
16793			uniform_real_distribution uniform(0, 1);
16794	0	0	projection.resize(dimension);
16795	0	0	for (auto&& row : projection) {
16796	0	0	row.resize(file_dimension);
16797	0	0	for (auto&& weight : row) weight = uniform(generator);
16798
16799			double sum = 0;
16800	0	0	for (auto&& weight : row) sum += weight;
16801	0	0	for (auto&& weight : row) weight /= sum;
16802			}
16803			}
16804
16805			// Load input embedding
16806	0	0	vector input_weights(file_dimension);
16807	0	0	vector projected_weights(dimension);
16808	0	0	while (getline(in, line) && int(weights.size()) < max_embeddings) {
		0
		0
		0
16809	0	0	split(line, ' ', parts);
16810	0	0	if (!parts.empty() && !parts.back().len) parts.pop_back(); // Ignore space at the end of line
		0
		0
16811	0	0	if (int(parts.size()) != file_dimension + 1) training_failure("Wrong number of values on line '" << line << "' of embedding file '" << tokens[3]);
		0
16812	0	0	for (int i = 0; i < file_dimension; i++)
16813	0	0	input_weights[i] = parse_double(parts[1 + i], "embedding weight");
16814
16815	0		string word(parts[0].str, parts[0].len);
16816
16817			// For update_weights == 2, ignore embeddings for unknown words
16818	0	0	if (update_weights == 2 && !word_counts.count(word))
16819			continue;
16820
16821	0	0	for (int i = 0; i < dimension; i++)
16822	0	0	if (file_dimension == dimension) {
16823	0		projected_weights[i] = input_weights[i];
16824			} else {
16825	0		projected_weights[i] = 0;
16826	0	0	for (int j = 0; j < file_dimension; j++)
16827	0		projected_weights[i] += projection[i][j] * input_weights[j];
16828			}
16829
16830	0	0	if (!weights_set.count(word)) {
16831	0	0	weights.emplace_back(word, projected_weights);
16832			weights_set.insert(word);
16833			}
16834			}
16835	0		embeddings_from_file = weights.size();
16836	0	0	updatable_index = update_weights ? 0 : embeddings_from_file;
16837			}
16838
16839			// Add embedding for non-present word with min_count, sorted by count
16840			{
16841	0		vector> count_words;
16842	0	0	for (auto&& word_count : word_counts)
16843	0	0	if (word_count.second >= min_count && !weights_set.count(word_count.first))
16844	0	0	count_words.emplace_back(word_count.second, word_count.first);
16845
16846			sort(count_words.rbegin(), count_words.rend());
16847
16848	0	0	vector word_weights(dimension);
16849			uniform_real_distribution uniform(-1, 1);
16850	0	0	for (auto&& count_word : count_words) {
16851	0	0	for (auto&& word_weight : word_weights)
16852	0		word_weight = uniform(generator);
16853
16854	0	0	weights.emplace_back(count_word.second, word_weights);
16855			}
16856			}
16857
16858			// If there are unknown words in the training data, create initial embedding
16859	0	0	vector unknown_weights(dimension);
16860	0	0	if (min_count > 1) {
16861			uniform_real_distribution uniform(-1, 1);
16862
16863	0	0	for (auto&& weight : unknown_weights)
16864	0		weight = uniform(generator);
16865			}
16866
16867			// Add the embedding
16868	0	0	parser.embeddings.emplace_back();
16869	0	0	parser.embeddings.back().create(dimension, updatable_index, weights, unknown_weights);
16870
16871			// Count the cover of this embedding
16872			string buffer;
16873			unsigned words_total = 0, words_covered = 0, words_covered_from_file = 0;
16874	0	0	for (auto&& tree : train)
16875	0	0	for (auto&& node : tree.nodes)
16876	0	0	if (node.id) {
16877	0	0	parser.values.back().extract(node, word);
16878	0		words_total++;
16879	0	0	int word_id = parser.embeddings.back().lookup_word(word, buffer);
16880	0		words_covered += word_id != parser.embeddings.back().unknown_word();
16881	0	0	words_covered_from_file += word_id != parser.embeddings.back().unknown_word() && unsigned(word_id) < embeddings_from_file;
		0
16882			}
16883
16884			cerr << "Initialized '" << tokens[0] << "' embedding with " << embeddings_from_file << embeddings_from_file_comment
16885	0		<< "," << weights.size() << " words and " << fixed << setprecision(1) << 100. * words_covered_from_file / words_total
16886	0		<< "%," << 100. * words_covered / words_total << "% coverage." << endl;
16887			}
16888
16889			// Train the network
16890			unsigned total_dimension = 0, total_nodes = 0;
16891	0	0	for (auto&& embedding : parser.embeddings) total_dimension += embedding.dimension;
16892	0	0	for (auto&& tree : train) total_nodes += tree.nodes.size() - 1;
16893	0		auto scaled_parameters = parameters;
16894	0		scaled_parameters.l1_regularization /= train.size();
16895	0		scaled_parameters.l2_regularization /= total_nodes;
16896	0	0	neural_network_trainer network_trainer(parser.network, total_dimension * parser.nodes.node_count(), parser.system->transition_count(), scaled_parameters, generator);
		0
16897
16898	0		neural_network heldout_best_network;
16899			unsigned heldout_best_correct_labelled = 0, heldout_best_iteration = 0;
16900
16901			vector permutation;
16902	0	0	for (size_t i = 0; i < train.size(); i++)
16903	0		permutation.push_back(permutation.size());
16904
16905	0	0	for (int iteration = 1; network_trainer.next_iteration(); iteration++) {
16906			// Train on training data
16907	0		shuffle(permutation.begin(), permutation.end(), generator);
16908
16909	0		atomic<unsigned> atomic_index(0);
16910	0		atomic<double> atomic_logprob(0);
16911	0		auto training = [&]() {
16912	0		tree t;
16913	0		configuration conf(single_root);
16914			string word, word_buffer;
16915	0		vector> nodes_embeddings;
16916			vector extracted_nodes;
16917			vector*> extracted_embeddings;
16918	0		neural_network_trainer::workspace workspace;
16919			double logprob = 0;
16920
16921			// Data for structured prediction
16922	0	0	tree t_eval;
16923	0		configuration conf_eval(single_root);
16924	0		vector> nodes_embeddings_eval;
16925			vector extracted_nodes_eval;
16926			vector*> extracted_embeddings_eval;
16927			vector transitions_eval;
16928			vector hidden_layer_eval, outcomes_eval;
16929
16930	0	0	for (unsigned current_index; (current_index = atomic_index++) < permutation.size();) {
16931	0		const tree& gold = train[permutation[current_index]];
16932			t = gold;
16933			t.unlink_all_nodes();
16934	0	0	conf.init(&t);
16935
16936			// Compute embeddings
16937	0	0	if (t.nodes.size() > nodes_embeddings.size()) nodes_embeddings.resize(t.nodes.size());
		0
16938	0	0	for (size_t i = 0; i < t.nodes.size(); i++) {
16939	0	0	nodes_embeddings[i].resize(parser.embeddings.size());
16940	0	0	for (size_t j = 0; j < parser.embeddings.size(); j++) {
16941	0	0	parser.values[j].extract(t.nodes[i], word);
16942	0	0	nodes_embeddings[i][j] = parser.embeddings[j].lookup_word(word, word_buffer);
16943			}
16944			}
16945
16946			// Create tree oracle
16947	0	0	auto tree_oracle = oracle->create_tree_oracle(gold);
16948
16949			// Train the network
16950	0	0	while (!conf.final()) {
16951			// Extract nodes
16952	0	0	parser.nodes.extract(conf, extracted_nodes);
16953	0	0	extracted_embeddings.resize(extracted_nodes.size());
16954	0	0	for (size_t i = 0; i < extracted_nodes.size(); i++)
16955	0	0	extracted_embeddings[i] = extracted_nodes[i] >= 0 ? &nodes_embeddings[extracted_nodes[i]] : nullptr;
16956
16957			// Propagate
16958	0	0	network_trainer.propagate(parser.embeddings, extracted_embeddings, workspace);
16959
16960			// Find most probable applicable transition
16961			int network_best = -1;
16962	0	0	for (unsigned i = 0; i < workspace.outcomes.size(); i++)
16963	0	0	if (parser.system->applicable(conf, i) && (network_best < 0 \|\| workspace.outcomes[i] > workspace.outcomes[network_best]))
		0
		0
		0
		0
16964	0		network_best = i;
16965
16966			// Apply the oracle
16967	0	0	auto prediction = tree_oracle->predict(conf, network_best, iteration);
16968
16969			// If the best transition is applicable, train on it
16970	0	0	if (parser.system->applicable(conf, prediction.best)) {
		0
16971			// Update logprob
16972	0	0	if (workspace.outcomes[prediction.best])
16973	0		logprob += log(workspace.outcomes[prediction.best]);
16974
16975			// Backpropagate the chosen outcome
16976	0	0	network_trainer.backpropagate(parser.embeddings, extracted_embeddings, prediction.best, workspace);
16977			}
16978
16979			// Emergency break if the to_follow transition is not applicable
16980	0	0	if (!parser.system->applicable(conf, prediction.to_follow))
		0
16981			break;
16982
16983			// Follow the chosen outcome
16984	0	0	int child = parser.system->perform(conf, prediction.to_follow);
16985
16986			// If a node was linked, recompute its embeddings as deprel has changed
16987	0	0	if (child >= 0)
16988	0	0	for (size_t i = 0; i < parser.embeddings.size(); i++) {
16989	0	0	parser.values[i].extract(t.nodes[child], word);
16990	0	0	nodes_embeddings[child][i] = parser.embeddings[i].lookup_word(word, word_buffer);
16991			}
16992			}
16993			network_trainer.finalize_sentence();
16994
16995			// Structured prediction
16996	0	0	if (parameters.structured_interval && (current_index % parameters.structured_interval) == 0) {
		0
16997	0		uniform_int_distribution train_distribution(0, train.size() - 1);
16998	0		const tree& gold = train[train_distribution(generator)];
16999			t = gold;
17000			t.unlink_all_nodes();
17001	0	0	conf.init(&t);
17002
17003			// Compute embeddings
17004	0	0	if (t.nodes.size() > nodes_embeddings.size()) nodes_embeddings.resize(t.nodes.size());
		0
17005	0	0	for (size_t i = 0; i < t.nodes.size(); i++) {
17006	0	0	nodes_embeddings[i].resize(parser.embeddings.size());
17007	0	0	for (size_t j = 0; j < parser.embeddings.size(); j++) {
17008	0	0	parser.values[j].extract(t.nodes[i], word);
17009	0	0	nodes_embeddings[i][j] = parser.embeddings[j].lookup_word(word, word_buffer);
17010			}
17011			}
17012
17013			// Create tree oracle
17014	0	0	auto tree_oracle = oracle->create_tree_oracle(gold);
17015
17016			// Train the network
17017	0	0	while (!conf.final()) {
17018			// Extract nodes
17019	0	0	parser.nodes.extract(conf, extracted_nodes);
17020	0	0	extracted_embeddings.resize(extracted_nodes.size());
17021	0	0	for (size_t i = 0; i < extracted_nodes.size(); i++)
17022	0	0	extracted_embeddings[i] = extracted_nodes[i] >= 0 ? &nodes_embeddings[extracted_nodes[i]] : nullptr;
17023
17024			// Find the best transition
17025			int best = 0;
17026			int best_uas = -1;
17027	0	0	tree_oracle->interesting_transitions(conf, transitions_eval);
17028	0	0	for (auto&& transition : transitions_eval) {
17029			t_eval = t;
17030	0	0	conf_eval = conf;
17031	0		conf_eval.t = &t_eval;
17032	0	0	nodes_embeddings_eval = nodes_embeddings;
17033
17034			// Perform probed transition
17035	0	0	int child = parser.system->perform(conf_eval, transition);
17036	0	0	if (child >= 0)
17037	0	0	for (size_t i = 0; i < parser.embeddings.size(); i++) {
17038	0	0	parser.values[i].extract(t_eval.nodes[child], word);
17039	0	0	nodes_embeddings_eval[child][i] = parser.embeddings[i].lookup_word(word, word_buffer);
17040			}
17041
17042			// Train the network
17043	0	0	while (!conf_eval.final()) {
17044			// Extract nodes
17045	0	0	parser.nodes.extract(conf_eval, extracted_nodes_eval);
17046	0	0	extracted_embeddings_eval.resize(extracted_nodes_eval.size());
17047	0	0	for (size_t i = 0; i < extracted_nodes_eval.size(); i++)
17048	0	0	extracted_embeddings_eval[i] = extracted_nodes_eval[i] >= 0 ? &nodes_embeddings_eval[extracted_nodes_eval[i]] : nullptr;
17049
17050			// Classify using neural network
17051	0	0	parser.network.propagate(parser.embeddings, extracted_embeddings_eval, hidden_layer_eval, outcomes_eval, nullptr, false);
17052
17053			// Find most probable applicable transition
17054			int network_best = -1;
17055	0	0	for (unsigned i = 0; i < outcomes_eval.size(); i++)
17056	0	0	if (parser.system->applicable(conf_eval, i) && (network_best < 0 \|\| outcomes_eval[i] > outcomes_eval[network_best]))
		0
		0
		0
		0
17057	0		network_best = i;
17058
17059			// Perform the best transition
17060	0	0	int child = parser.system->perform(conf_eval, network_best);
17061
17062			// If a node was linked, recompute its embeddings as deprel has changed
17063	0	0	if (child >= 0)
17064	0	0	for (size_t i = 0; i < parser.embeddings.size(); i++) {
17065	0	0	parser.values[i].extract(t_eval.nodes[child], word);
17066	0	0	nodes_embeddings_eval[child][i] = parser.embeddings[i].lookup_word(word, word_buffer);
17067			}
17068			}
17069
17070			int uas = 0;
17071	0	0	for (unsigned i = 1; i < gold.nodes.size(); i++)
17072	0		uas += gold.nodes[i].head == t_eval.nodes[i].head;
17073
17074	0	0	if (uas > best_uas) best = transition, best_uas = uas;
17075			}
17076
17077			// Propagate
17078	0	0	network_trainer.propagate(parser.embeddings, extracted_embeddings, workspace);
17079
17080			// Backpropagate for the best transition
17081	0	0	if (workspace.outcomes[best])
17082	0		logprob += log(workspace.outcomes[best]);
17083	0	0	network_trainer.backpropagate(parser.embeddings, extracted_embeddings, best, workspace);
17084
17085			// // Find most probable applicable transition when following network outcome
17086			// int network_best = -1;
17087			// for (unsigned i = 0; i < workspace.outcomes.size(); i++)
17088			// if (parser.system->applicable(conf, i) && (network_best < 0 \|\| workspace.outcomes[i] > workspace.outcomes[network_best]))
17089			// network_best = i;
17090
17091			// Follow the best outcome
17092	0	0	int child = parser.system->perform(conf, /*network_*/best);
17093
17094			// If a node was linked, recompute its embeddings as deprel has changed
17095	0	0	if (child >= 0)
17096	0	0	for (size_t i = 0; i < parser.embeddings.size(); i++) {
17097	0	0	parser.values[i].extract(t.nodes[child], word);
17098	0	0	nodes_embeddings[child][i] = parser.embeddings[i].lookup_word(word, word_buffer);
17099			}
17100			}
17101			network_trainer.finalize_sentence();
17102			}
17103			}
17104	0	0	for (double old_atomic_logprob = atomic_logprob; atomic_logprob.compare_exchange_weak(old_atomic_logprob, old_atomic_logprob + logprob); ) {}
17105	0		};
17106
17107	0	0	cerr << "Iteration " << iteration << ": ";
		0
17108	0	0	training();
17109			cerr << "training logprob " << scientific << setprecision(4) << atomic_logprob;
17110
17111			// Evaluate heldout data if present
17112	0	0	if (!heldout.empty()) {
17113	0	0	tree t;
17114			unsigned total = 0, correct_unlabelled = 0, correct_labelled = 0;
17115	0	0	for (auto&& gold : heldout) {
17116			t = gold;
17117			t.unlink_all_nodes();
17118			parser.parse(t);
17119	0	0	for (size_t i = 1; i < t.nodes.size(); i++) {
17120	0		total++;
17121	0		correct_unlabelled += t.nodes[i].head == gold.nodes[i].head;
17122	0	0	correct_labelled += t.nodes[i].head == gold.nodes[i].head && t.nodes[i].deprel == gold.nodes[i].deprel;
		0
17123			}
17124			}
17125
17126	0	0	cerr << ", heldout UAS " << fixed << setprecision(2) << (100. * correct_unlabelled / total) << "%, LAS " << (100. * correct_labelled / total) << "%";
17127
17128	0	0	if (parameters.early_stopping && correct_labelled > heldout_best_correct_labelled) {
		0
17129	0	0	heldout_best_network = parser.network;
17130			heldout_best_correct_labelled = correct_labelled;
17131	0		heldout_best_iteration = iteration;
17132			}
17133			}
17134
17135			cerr << endl;
17136			}
17137
17138	0	0	if (parameters.early_stopping && heldout_best_iteration > 0) {
		0
17139			cerr << "Using early stopping -- choosing network from iteration " << heldout_best_iteration << endl;
17140	0	0	parser.network = heldout_best_network;
17141			}
17142
17143			// Encode version
17144	0	0	enc.add_1B(parser.version);
17145
17146			// Encode single_root
17147	0	0	enc.add_1B(single_root);
17148
17149			// Encode transition system
17150	0	0	enc.add_2B(parser.labels.size());
17151	0	0	for (auto&& label : parser.labels)
17152	0	0	enc.add_str(label);
17153	0	0	enc.add_str(transition_system_name);
17154
17155			// Encode nodes selector
17156	0	0	enc.add_str(nodes_description);
17157
17158			// Encode value extractors and embeddings
17159	0	0	enc.add_2B(value_names.size());
17160	0	0	for (auto&& value_name : value_names)
17161	0	0	enc.add_str(value_name);
17162	0	0	for (auto&& embedding : parser.embeddings)
17163	0	0	embedding.save(enc);
17164
17165			// Encode the network
17166	0	0	network_trainer.save_network(enc);
17167	0		}
17168
17169			} // namespace parsito
17170
17171			/////////
17172			// File: parsito/transition/transition.cpp
17173			/////////
17174
17175			// This file is part of Parsito .
17176			//
17177			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
17178			// Mathematics and Physics, Charles University in Prague, Czech Republic.
17179			//
17180			// This Source Code Form is subject to the terms of the Mozilla Public
17181			// License, v. 2.0. If a copy of the MPL was not distributed with this
17182			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
17183
17184			namespace parsito {
17185
17186			// Left arc
17187	387		bool transition_left_arc::applicable(const configuration& conf) const {
17188	387	50	if (conf.single_root && label_is_root)
		0
17189			return false;
17190			else
17191	387	100	return conf.stack.size() >= 2 && conf.stack[conf.stack.size() - 2];
		100
17192			}
17193
17194	15		int transition_left_arc::perform(configuration& conf) const {
17195	15	50	assert(applicable(conf));
17196
17197	15		int parent = conf.stack.back(); conf.stack.pop_back();
17198	15		int child = conf.stack.back(); conf.stack.pop_back();
17199	15		conf.stack.push_back(parent);
17200	15		conf.t->set_head(child, parent, label);
17201	15		return child;
17202			}
17203
17204			// Right arc
17205	395		bool transition_right_arc::applicable(const configuration& conf) const {
17206	395	50	if (conf.single_root && label_is_root)
		0
17207	0	0	return conf.stack.size() == 2 && conf.buffer.empty();
		0
17208	395	50	else if (conf.single_root) // && !label_is_root
17209	0		return conf.stack.size() > 2;
17210			else
17211	395		return conf.stack.size() >= 2;
17212			}
17213
17214	23		int transition_right_arc::perform(configuration& conf) const {
17215	23	50	assert(applicable(conf));
17216
17217	23		int child = conf.stack.back(); conf.stack.pop_back();
17218	23		int parent = conf.stack.back();
17219	23		conf.t->set_head(child, parent, label);
17220	23		return child;
17221			}
17222
17223			// Shift
17224	90		bool transition_shift::applicable(const configuration& conf) const {
17225	90		return !conf.buffer.empty();
17226			}
17227
17228	28		int transition_shift::perform(configuration& conf) const {
17229	28	50	assert(applicable(conf));
17230
17231	28		conf.stack.push_back(conf.buffer.back());
17232			conf.buffer.pop_back();
17233	28		return -1;
17234			}
17235
17236			// Swap
17237	0		bool transition_swap::applicable(const configuration& conf) const {
17238	0	0	return conf.stack.size() >= 2 && conf.stack[conf.stack.size() - 2] && conf.stack[conf.stack.size() - 2] < conf.stack[conf.stack.size() - 1];
		0
		0
17239			}
17240
17241	0		int transition_swap::perform(configuration& conf) const {
17242	0	0	assert(applicable(conf));
17243
17244	0		int top = conf.stack.back(); conf.stack.pop_back();
17245	0		int to_buffer = conf.stack.back(); conf.stack.pop_back();
17246	0		conf.stack.push_back(top);
17247	0		conf.buffer.push_back(to_buffer);
17248	0		return -1;
17249			}
17250
17251			// Left arc 2
17252	0		bool transition_left_arc_2::applicable(const configuration& conf) const {
17253	0	0	if (conf.single_root && label_is_root)
		0
17254			return false;
17255			else
17256	0	0	return conf.stack.size() >= 3 && conf.stack[conf.stack.size() - 3];
		0
17257			}
17258
17259	0		int transition_left_arc_2::perform(configuration& conf) const {
17260	0	0	assert(applicable(conf));
17261
17262	0		int parent = conf.stack.back(); conf.stack.pop_back();
17263	0		int ignore = conf.stack.back(); conf.stack.pop_back();
17264	0		int child = conf.stack.back(); conf.stack.pop_back();
17265	0		conf.stack.push_back(ignore);
17266	0		conf.stack.push_back(parent);
17267	0		conf.t->set_head(child, parent, label);
17268	0		return child;
17269			}
17270
17271			// Right arc 2
17272	0		bool transition_right_arc_2::applicable(const configuration& conf) const {
17273	0	0	if (conf.single_root && label_is_root)
		0
17274			return false;
17275	0	0	else if (conf.single_root) // && !label_is_root
17276	0		return conf.stack.size() >= 4;
17277			else
17278	0		return conf.stack.size() >= 3;
17279			}
17280
17281	0		int transition_right_arc_2::perform(configuration& conf) const {
17282	0	0	assert(applicable(conf));
17283
17284	0		int child = conf.stack.back(); conf.stack.pop_back();
17285	0		int to_buffer = conf.stack.back(); conf.stack.pop_back();
17286	0		int parent = conf.stack.back();
17287	0		conf.buffer.push_back(to_buffer);
17288	0		conf.t->set_head(child, parent, label);
17289	0		return child;
17290			}
17291
17292			} // namespace parsito
17293
17294			/////////
17295			// File: parsito/transition/transition_system_link2.h
17296			/////////
17297
17298			// This file is part of Parsito .
17299			//
17300			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
17301			// Mathematics and Physics, Charles University in Prague, Czech Republic.
17302			//
17303			// This Source Code Form is subject to the terms of the Mozilla Public
17304			// License, v. 2.0. If a copy of the MPL was not distributed with this
17305			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
17306
17307			namespace parsito {
17308
17309	0		class transition_system_link2 : public transition_system {
17310			public:
17311			transition_system_link2(const vector<string>& labels);
17312
17313			virtual transition_oracle* oracle(const string& name) const override;
17314			};
17315
17316			} // namespace parsito
17317
17318			/////////
17319			// File: parsito/transition/transition_system_projective.h
17320			/////////
17321
17322			// This file is part of Parsito .
17323			//
17324			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
17325			// Mathematics and Physics, Charles University in Prague, Czech Republic.
17326			//
17327			// This Source Code Form is subject to the terms of the Mozilla Public
17328			// License, v. 2.0. If a copy of the MPL was not distributed with this
17329			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
17330
17331			namespace parsito {
17332
17333	2		class transition_system_projective : public transition_system {
17334			public:
17335			transition_system_projective(const vector<string>& labels);
17336
17337			virtual transition_oracle* oracle(const string& name) const override;
17338			};
17339
17340			} // namespace parsito
17341
17342			/////////
17343			// File: parsito/transition/transition_system_swap.h
17344			/////////
17345
17346			// This file is part of Parsito .
17347			//
17348			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
17349			// Mathematics and Physics, Charles University in Prague, Czech Republic.
17350			//
17351			// This Source Code Form is subject to the terms of the Mozilla Public
17352			// License, v. 2.0. If a copy of the MPL was not distributed with this
17353			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
17354
17355			namespace parsito {
17356
17357	0		class transition_system_swap : public transition_system {
17358			public:
17359			transition_system_swap(const vector<string>& labels);
17360
17361			virtual transition_oracle* oracle(const string& name) const override;
17362			};
17363
17364			} // namespace parsito
17365
17366			/////////
17367			// File: parsito/transition/transition_system.cpp
17368			/////////
17369
17370			// This file is part of Parsito .
17371			//
17372			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
17373			// Mathematics and Physics, Charles University in Prague, Czech Republic.
17374			//
17375			// This Source Code Form is subject to the terms of the Mozilla Public
17376			// License, v. 2.0. If a copy of the MPL was not distributed with this
17377			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
17378
17379			namespace parsito {
17380
17381	0		unsigned transition_system::transition_count() const {
17382	0		return transitions.size();
17383			}
17384
17385	806		bool transition_system::applicable(const configuration& conf, unsigned transition) const {
17386	806	50	assert(transition < transitions.size());
17387
17388	806		return transitions[transition]->applicable(conf);
17389			}
17390
17391	66		int transition_system::perform(configuration& conf, unsigned transition) const {
17392	66	50	assert(transition < transitions.size());
17393
17394	66		return transitions[transition]->perform(conf);
17395			}
17396
17397	1		transition_system* transition_system::create(const string& name, const vector<string>& labels) {
17398	1	50	if (name == "projective") return new transition_system_projective(labels);
		50
17399	0	0	if (name == "swap") return new transition_system_swap(labels);
		0
17400	1	0	if (name == "link2") return new transition_system_link2(labels);
		0
17401			return nullptr;
17402			}
17403
17404			} // namespace parsito
17405
17406			/////////
17407			// File: parsito/transition/transition_system_link2.cpp
17408			/////////
17409
17410			// This file is part of Parsito .
17411			//
17412			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
17413			// Mathematics and Physics, Charles University in Prague, Czech Republic.
17414			//
17415			// This Source Code Form is subject to the terms of the Mozilla Public
17416			// License, v. 2.0. If a copy of the MPL was not distributed with this
17417			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
17418
17419			namespace parsito {
17420
17421	0		transition_system_link2::transition_system_link2(const vector& labels) : transition_system(labels) {
17422	0	0	transitions.emplace_back(new transition_shift());
		0
17423	0	0	for (auto&& label : labels) {
17424	0	0	transitions.emplace_back(new transition_left_arc(label));
		0
		0
17425	0	0	transitions.emplace_back(new transition_right_arc(label));
		0
		0
17426	0	0	transitions.emplace_back(new transition_left_arc_2(label));
		0
		0
17427	0	0	transitions.emplace_back(new transition_right_arc_2(label));
		0
		0
17428			}
17429	0		}
17430
17431			// Static oracle
17432	0		class transition_system_link2_oracle_static : public transition_oracle {
17433			public:
17434	0		transition_system_link2_oracle_static(const vector& labels) : labels(labels) {
17435	0	0	for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break;
		0
17436	0		}
17437
17438	0		class tree_oracle_static : public transition_oracle::tree_oracle {
17439			public:
17440	0		tree_oracle_static(const vector& labels, unsigned root_label, const tree& gold) : labels(labels), root_label(root_label), gold(gold) {}
17441			virtual predicted_transition predict(const configuration& conf, unsigned network_outcome, unsigned iteration) const override;
17442			virtual void interesting_transitions(const configuration& conf, vector& transitions) const override;
17443			private:
17444			const vector& labels;
17445			unsigned root_label;
17446			const tree& gold;
17447			};
17448
17449			virtual unique_ptr create_tree_oracle(const tree& gold) const override;
17450			private:
17451			const vector& labels;
17452			unsigned root_label;
17453			};
17454
17455	0		unique_ptr<transition_oracle::tree_oracle> transition_system_link2_oracle_static::create_tree_oracle(const tree& gold) const {
17456	0		return unique_ptr<transition_oracle::tree_oracle>(new tree_oracle_static(labels, root_label, gold));
17457			}
17458
17459	0		void transition_system_link2_oracle_static::tree_oracle_static::interesting_transitions(const configuration& conf, vector& transitions) const {
17460			transitions.clear();
17461
17462			// Shift
17463	0	0	if (!conf.buffer.empty()) transitions.push_back(0);
17464
17465			// Arcs
17466	0		unsigned parents[4] = {1, 2, 1, 3};
17467	0		unsigned children[4] = {2, 1, 3, 1};
17468	0	0	for (int direction = 0; direction < 4; direction++)
17469	0	0	if (conf.stack.size() >= parents[direction] && conf.stack.size() >= children[direction]) {
		0
		0
17470	0		int parent = conf.stack[conf.stack.size() - parents[direction]];
17471	0		int child = conf.stack[conf.stack.size() - children[direction]];
17472
17473			// Allow arc_2 only when seeing golden edge.
17474	0	0	if (direction >= 2 && gold.nodes[child].head != parent) continue;
		0
		0
17475
17476	0	0	for (size_t i = 0; i < labels.size(); i++)
17477	0	0	if (gold.nodes[child].deprel == labels[i])
17478	0	0	if (!conf.single_root \|\|
		0
17479	0	0	(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) \|\|
		0
		0
		0
17480	0	0	(i != root_label && conf.stack.size() > 2 && direction < 2) \|\|
		0
		0
		0
17481	0	0	(i != root_label && conf.stack.size() > 3 && direction >= 2))
		0
17482	0		transitions.push_back(1 + 4*i + direction);
17483			}
17484	0		}
17485
17486	0		transition_oracle::predicted_transition transition_system_link2_oracle_static::tree_oracle_static::predict(const configuration& conf, unsigned /*network_outcome*/, unsigned /*iteration*/) const {
17487			// Arcs
17488	0		unsigned parents[4] = {1, 2, 1, 3};
17489	0		unsigned children[4] = {2, 1, 3, 1};
17490	0	0	for (int direction = 0; direction < 4; direction++)
17491	0	0	if (conf.stack.size() >= parents[direction] && conf.stack.size() >= children[direction]) {
		0
		0
17492	0		int parent = conf.stack[conf.stack.size() - parents[direction]];
17493	0		int child = conf.stack[conf.stack.size() - children[direction]];
17494
17495	0	0	if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) {
		0
		0
17496	0	0	for (size_t i = 0; i < labels.size(); i++)
17497	0	0	if (gold.nodes[child].deprel == labels[i])
17498	0		return predicted_transition(1 + 4*i + direction, 1 + 4*i + direction);
17499
17500	0		assert(!"label was not found");
17501			}
17502			}
17503
17504			// Otherwise, just shift
17505	0		return predicted_transition(0, 0);
17506			}
17507
17508			// Oracle factory method
17509	0		transition_oracle* transition_system_link2::oracle(const string& name) const {
17510	0	0	if (name == "static") return new transition_system_link2_oracle_static(labels);
		0
17511			return nullptr;
17512			}
17513
17514			} // namespace parsito
17515
17516			/////////
17517			// File: parsito/transition/transition_system_projective.cpp
17518			/////////
17519
17520			// This file is part of Parsito .
17521			//
17522			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
17523			// Mathematics and Physics, Charles University in Prague, Czech Republic.
17524			//
17525			// This Source Code Form is subject to the terms of the Mozilla Public
17526			// License, v. 2.0. If a copy of the MPL was not distributed with this
17527			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
17528
17529			namespace parsito {
17530
17531	1		transition_system_projective::transition_system_projective(const vector& labels) : transition_system(labels) {
17532	1	50	transitions.emplace_back(new transition_shift());
		50
17533	7	100	for (auto&& label : labels) {
17534	6	50	transitions.emplace_back(new transition_left_arc(label));
		50
		50
17535	6	50	transitions.emplace_back(new transition_right_arc(label));
		50
		50
17536			}
17537	1		}
17538
17539			// Static oracle
17540	0		class transition_system_projective_oracle_static : public transition_oracle {
17541			public:
17542	0		transition_system_projective_oracle_static(const vector& labels) : labels(labels) {
17543	0	0	for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break;
		0
17544	0		}
17545
17546	0		class tree_oracle_static : public transition_oracle::tree_oracle {
17547			public:
17548	0		tree_oracle_static(const vector& labels, unsigned root_label, const tree& gold) : labels(labels), root_label(root_label), gold(gold) {}
17549			virtual predicted_transition predict(const configuration& conf, unsigned network_outcome, unsigned iteration) const override;
17550			virtual void interesting_transitions(const configuration& conf, vector& transitions) const override;
17551			private:
17552			const vector& labels;
17553			unsigned root_label;
17554			const tree& gold;
17555			};
17556
17557			virtual unique_ptr create_tree_oracle(const tree& gold) const override;
17558			private:
17559			const vector& labels;
17560			unsigned root_label;
17561			};
17562
17563	0		unique_ptr<transition_oracle::tree_oracle> transition_system_projective_oracle_static::create_tree_oracle(const tree& gold) const {
17564	0		return unique_ptr<transition_oracle::tree_oracle>(new tree_oracle_static(labels, root_label, gold));
17565			}
17566
17567	0		void transition_system_projective_oracle_static::tree_oracle_static::interesting_transitions(const configuration& conf, vector& transitions) const {
17568			transitions.clear();
17569	0	0	if (!conf.buffer.empty()) transitions.push_back(0);
17570	0	0	if (conf.stack.size() >= 2)
17571	0	0	for (int direction = 0; direction < 2; direction++) {
17572	0		int child = conf.stack[conf.stack.size() - 2 + direction];
17573	0	0	for (size_t i = 0; i < labels.size(); i++)
17574	0	0	if (gold.nodes[child].deprel == labels[i])
17575	0	0	if (!conf.single_root \|\|
		0
17576	0	0	(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) \|\|
		0
		0
		0
		0
17577	0	0	(i != root_label && conf.stack.size() > 2))
17578	0		transitions.push_back(1 + 2*i + direction);
17579			}
17580	0		}
17581
17582	0		transition_oracle::predicted_transition transition_system_projective_oracle_static::tree_oracle_static::predict(const configuration& conf, unsigned /*network_outcome*/, unsigned /*iteration*/) const {
17583			// Use left if appropriate
17584	0	0	if (conf.stack.size() >= 2) {
17585	0		int parent = conf.stack[conf.stack.size() - 1];
17586	0		int child = conf.stack[conf.stack.size() - 2];
17587	0	0	if (gold.nodes[child].head == parent) {
17588	0	0	for (size_t i = 0; i < labels.size(); i++)
17589	0	0	if (gold.nodes[child].deprel == labels[i])
17590	0		return predicted_transition(1 + 2*i, 1 + 2*i);
17591
17592	0		assert(!"label was not found");
17593			}
17594			}
17595
17596			// Use right if appropriate
17597	0	0	if (conf.stack.size() >= 2) {
17598	0		int child = conf.stack[conf.stack.size() - 1];
17599	0		int parent = conf.stack[conf.stack.size() - 2];
17600	0	0	if (gold.nodes[child].head == parent &&
		0
		0
17601	0	0	(conf.buffer.empty() \|\| gold.nodes[child].children.empty() \|\| gold.nodes[child].children.back() < conf.buffer.back())) {
		0
17602	0	0	for (size_t i = 0; i < labels.size(); i++)
17603	0	0	if (gold.nodes[child].deprel == labels[i])
17604	0		return predicted_transition(1 + 2*i + 1, 1 + 2*i + 1);
17605
17606	0		assert(!"label was not found");
17607			}
17608			}
17609
17610			// Otherwise, just shift
17611	0		return predicted_transition(0, 0);
17612			}
17613
17614			// Dynamic oracle
17615	0		class transition_system_projective_oracle_dynamic : public transition_oracle {
17616			public:
17617	0		transition_system_projective_oracle_dynamic(const vector& labels) : labels(labels) {
17618	0	0	for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break;
		0
17619	0		}
17620
17621	0		class tree_oracle_dynamic : public transition_oracle::tree_oracle {
17622			public:
17623	0		tree_oracle_dynamic(const vector& labels, unsigned root_label, const tree& gold) : labels(labels), gold(gold), oracle_static(labels, root_label, gold) {}
17624			virtual predicted_transition predict(const configuration& conf, unsigned network_outcome, unsigned iteration) const override;
17625			virtual void interesting_transitions(const configuration& conf, vector& transitions) const override;
17626			private:
17627			const vector& labels;
17628			const tree& gold;
17629			transition_system_projective_oracle_static::tree_oracle_static oracle_static;
17630			};
17631
17632			virtual unique_ptr create_tree_oracle(const tree& gold) const override;
17633			private:
17634			const vector& labels;
17635			unsigned root_label;
17636			};
17637
17638	0		unique_ptr<transition_oracle::tree_oracle> transition_system_projective_oracle_dynamic::create_tree_oracle(const tree& gold) const {
17639	0		return unique_ptr<transition_oracle::tree_oracle>(new tree_oracle_dynamic(labels, root_label, gold));
17640			}
17641
17642	0		void transition_system_projective_oracle_dynamic::tree_oracle_dynamic::interesting_transitions(const configuration& conf, vector& transitions) const {
17643	0		oracle_static.interesting_transitions(conf, transitions);
17644	0		}
17645
17646	0		transition_oracle::predicted_transition transition_system_projective_oracle_dynamic::tree_oracle_dynamic::predict(const configuration& conf, unsigned network_outcome, unsigned iteration) const {
17647			// Use static oracle in the first iteration
17648	0	0	if (iteration <= 1)
17649	0		return oracle_static.predict(conf, network_outcome, iteration);
17650
17651			// Use dynamic programming to compute transition leading to best parse tree
17652
17653			// Start by computing the right stack
17654			vector right_stack;
17655
17656			unordered_set right_stack_inserted;
17657	0	0	if (!conf.buffer.empty()) {
17658	0		int buffer_start = conf.buffer.back();
17659	0	0	for (size_t i = conf.buffer.size(); i--; ) {
17660			const auto& node = conf.buffer[i];
17661	0		bool to_right_stack = gold.nodes[node].head < buffer_start;
17662	0	0	for (auto&& child : gold.nodes[node].children)
17663	0		to_right_stack |= child < buffer_start || right_stack_inserted.count(child);
17664	0	0	if (to_right_stack) {
17665	0	0	right_stack.push_back(node);
17666			right_stack_inserted.insert(node);
17667			}
17668			}
17669			}
17670
17671			// Fill the array T from the 2014 Goldberg paper
17672	0	0	class t_representation {
		0
		0
		0
17673			public:
17674	0		t_representation(const vector& stack, const vector& right_stack, const tree& gold, const vector& labels)
17675	0	0	: stack(stack), right_stack(right_stack), gold(gold), labels(labels) {
		0
17676	0	0	for (int i = 0; i < 2; i++) {
17677	0	0	costs[i].reserve((stack.size() + right_stack.size()) * (stack.size() + right_stack.size()));
17678	0	0	transitions[i].reserve((stack.size() + right_stack.size()) * (stack.size() + right_stack.size()));
17679			}
17680	0	0	}
		0
17681
17682	0		void prepare(unsigned diagonal) {
17683	0		costs[diagonal & 1].assign((diagonal + 1) * (diagonal + 1), gold.nodes.size() + 1);
17684	0		transitions[diagonal & 1].assign((diagonal + 1) * (diagonal + 1), -1);
17685	0		}
17686
17687	0		int& cost(unsigned i, unsigned j, unsigned h) { return costs[(i+j) & 1][i * (i+j+1) + h]; }
17688	0		int& transition(unsigned i, unsigned j, unsigned h) { return transitions[(i+j) & 1][i * (i+j+1) + h]; }
17689
17690	0	0	int node(unsigned i, unsigned /*j*/, unsigned h) const { return h <= i ? stack[stack.size() - 1 - i + h] : right_stack[h - i - 1]; }
		0
		0
17691	0		int edge_cost(int parent, int child) const { return gold.nodes[child].head != parent; }
17692	0		int which_arc_transition(int parent, int child) const {
17693	0	0	for (size_t i = 0; i < labels.size(); i++)
17694	0	0	if (gold.nodes[child].deprel == labels[i])
17695	0		return 1 + 2*i + (child > parent);
17696	0		assert(!"label was not found");
17697			return 0; // To keep VS 2015 happy and warning-free
17698			}
17699
17700			private:
17701			const vector& stack;
17702			const vector& right_stack;
17703			const tree& gold;
17704			const vector& labels;
17705			vector costs[2], transitions[2];
17706	0	0	} t(conf.stack, right_stack, gold, labels);
17707
17708	0	0	t.prepare(0);
17709	0		t.cost(0, 0, 0) = 0;
17710	0	0	for (unsigned diagonal = 0; diagonal < conf.stack.size() + right_stack.size(); diagonal++) {
17711	0	0	t.prepare(diagonal + 1);
17712	0	0	for (unsigned i = diagonal > right_stack.size() ? diagonal - right_stack.size() : 0; i <= diagonal && i < conf.stack.size(); i++) {
		0
		0
		0
17713	0		unsigned j = diagonal - i;
17714
17715			// Try extending stack
17716	0	0	if (i+1 < conf.stack.size())
17717	0	0	for (unsigned h = 0; h <= diagonal; h++) {
17718			int h_node = t.node(i, j, h), new_node = t.node(i+1, j, 0);
17719	0	0	if (new_node && t.cost(i, j, h) + t.edge_cost(h_node, new_node) < t.cost(i+1, j, h+1) + (t.transition(i, j, h) != 0)) {
		0
		0
17720	0		t.cost(i+1, j, h+1) = t.cost(i, j, h) + t.edge_cost(h_node, new_node);
17721	0	0	t.transition(i+1, j, h+1) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : t.which_arc_transition(h_node, new_node);
17722			}
17723	0	0	if (t.cost(i, j, h) + t.edge_cost(new_node, h_node) < t.cost(i+1, j, 0) + (t.transition(i, j, h) != 0)) {
17724	0		t.cost(i+1, j, 0) = t.cost(i, j, h) + t.edge_cost(new_node, h_node);
17725	0	0	t.transition(i+1, j, 0) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : t.which_arc_transition(new_node, h_node);
17726			}
17727			}
17728
17729			// Try extending right_stack
17730	0	0	if (j+1 < right_stack.size() + 1)
17731	0	0	for (unsigned h = 0; h <= diagonal; h++) {
17732			int h_node = t.node(i, j, h), new_node = t.node(i, j+1, diagonal+1);
17733	0	0	if (t.cost(i, j, h) + t.edge_cost(h_node, new_node) < t.cost(i, j+1, h) + (t.transition(i, j, h) > 0)) {
17734	0		t.cost(i, j+1, h) = t.cost(i, j, h) + t.edge_cost(h_node, new_node);
17735	0	0	t.transition(i, j+1, h) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : 0;
17736			}
17737	0	0	if (h_node && t.cost(i, j, h) + t.edge_cost(new_node, h_node) < t.cost(i, j+1, diagonal+1) + (t.transition(i, j, h) > 0)) {
		0
		0
17738	0		t.cost(i, j+1, diagonal+1) = t.cost(i, j, h) + t.edge_cost(new_node, h_node);
17739	0	0	t.transition(i, j+1, diagonal+1) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : 0;
17740			}
17741			}
17742			}
17743			}
17744
17745	0		return predicted_transition(t.transition(conf.stack.size() - 1, right_stack.size(), 0), network_outcome);
17746			}
17747
17748			// Oracle factory method
17749	0		transition_oracle* transition_system_projective::oracle(const string& name) const {
17750	0	0	if (name == "static") return new transition_system_projective_oracle_static(labels);
		0
17751	0	0	if (name == "dynamic") return new transition_system_projective_oracle_dynamic(labels);
		0
17752			return nullptr;
17753			}
17754
17755			} // namespace parsito
17756
17757			/////////
17758			// File: parsito/transition/transition_system_swap.cpp
17759			/////////
17760
17761			// This file is part of Parsito .
17762			//
17763			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
17764			// Mathematics and Physics, Charles University in Prague, Czech Republic.
17765			//
17766			// This Source Code Form is subject to the terms of the Mozilla Public
17767			// License, v. 2.0. If a copy of the MPL was not distributed with this
17768			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
17769
17770			namespace parsito {
17771
17772	0		transition_system_swap::transition_system_swap(const vector& labels) : transition_system(labels) {
17773	0	0	transitions.emplace_back(new transition_shift());
		0
17774	0	0	transitions.emplace_back(new transition_swap());
		0
17775	0	0	for (auto&& label : labels) {
17776	0	0	transitions.emplace_back(new transition_left_arc(label));
		0
		0
17777	0	0	transitions.emplace_back(new transition_right_arc(label));
		0
		0
17778			}
17779	0		}
17780
17781			// Static oracle
17782	0		class transition_system_swap_oracle_static : public transition_oracle {
17783			public:
17784	0		transition_system_swap_oracle_static(const vector& labels, bool lazy) : labels(labels), lazy(lazy) {
17785	0	0	for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break;
		0
17786	0		}
17787
17788	0		class tree_oracle_static : public transition_oracle::tree_oracle {
17789			public:
17790	0		tree_oracle_static(const vector& labels, unsigned root_label, const tree& gold, vector&& projective_order, vector&& projective_components)
17791	0	0	: labels(labels), root_label(root_label), gold(gold), projective_order(projective_order), projective_components(projective_components) {}
		0
17792			virtual predicted_transition predict(const configuration& conf, unsigned network_outcome, unsigned iteration) const override;
17793			virtual void interesting_transitions(const configuration& conf, vector& transitions) const override;
17794			private:
17795			const vector& labels;
17796			unsigned root_label;
17797			const tree& gold;
17798			const vector projective_order;
17799			const vector projective_components;
17800			};
17801
17802			virtual unique_ptr create_tree_oracle(const tree& gold) const override;
17803			private:
17804			void create_projective_order(const tree& gold, int node, vector& projective_order, int& projective_index) const;
17805			void create_projective_component(const tree& gold, int node, vector& projective_components, int component_index) const;
17806
17807			const vector& labels;
17808			bool lazy;
17809			unsigned root_label;
17810			};
17811
17812	0		unique_ptr transition_system_swap_oracle_static::create_tree_oracle(const tree& gold) const {
17813	0		vector projective_order(gold.nodes.size());
17814			int projective_index;
17815	0		create_projective_order(gold, 0, projective_order, projective_index);
17816
17817			vector projective_components;
17818	0	0	if (lazy) {
17819	0	0	tree_oracle_static projective_oracle(labels, root_label, gold, vector(), vector());
17820	0		configuration conf(false);
17821			tree t = gold;
17822	0	0	transition_system_swap system(labels);
17823
17824	0	0	conf.init(&t);
17825	0	0	while (!conf.final()) {
17826	0		auto prediction = projective_oracle.predict(conf, 0, 0);
17827	0	0	if (!system.applicable(conf, prediction.to_follow)) break;
		0
17828	0	0	system.perform(conf, prediction.to_follow);
17829			}
17830
17831	0		projective_components.assign(gold.nodes.size(), 0);
17832	0	0	for (auto&& node : conf.stack)
17833	0	0	if (node)
17834	0		create_projective_component(t, node, projective_components, node);
17835			}
17836
17837	0	0	return unique_ptr(new tree_oracle_static(labels, root_label, gold, move(projective_order), move(projective_components)));
		0
17838			}
17839
17840	0		void transition_system_swap_oracle_static::create_projective_order(const tree& gold, int node, vector& projective_order, int& projective_index) const {
17841			unsigned child_index = 0;
17842	0	0	while (child_index < gold.nodes[node].children.size() && gold.nodes[node].children[child_index] < node)
		0
		0
17843	0		create_projective_order(gold, gold.nodes[node].children[child_index++], projective_order, projective_index);
17844	0		projective_order[node] = projective_index++;
17845	0	0	while (child_index < gold.nodes[node].children.size())
17846	0		create_projective_order(gold, gold.nodes[node].children[child_index++], projective_order, projective_index);
17847	0		}
17848
17849	0		void transition_system_swap_oracle_static::create_projective_component(const tree& gold, int node, vector& projective_components, int component_index) const {
17850	0		projective_components[node] = component_index;
17851	0	0	for (auto&& child : gold.nodes[node].children)
17852	0		create_projective_component(gold, child, projective_components, component_index);
17853	0		}
17854
17855	0		void transition_system_swap_oracle_static::tree_oracle_static::interesting_transitions(const configuration& conf, vector& transitions) const {
17856			transitions.clear();
17857	0	0	if (!conf.buffer.empty()) transitions.push_back(0);
17858	0	0	if (conf.stack.size() >= 2) {
17859			// Swap
17860	0	0	if (!projective_order.empty()) {
17861	0		int last = conf.stack[conf.stack.size() - 1];
17862	0		int prev = conf.stack[conf.stack.size() - 2];
17863	0	0	if (projective_order[last] < projective_order[prev] &&
		0
		0
17864	0	0	(projective_components.empty() ||
17865	0	0	(conf.buffer.empty() || projective_components[last] != projective_components[conf.buffer.back()])))
17866	0		transitions.push_back(1);
17867			}
17868
17869			// Arcs
17870	0	0	for (int direction = 0; direction < 2; direction++) {
17871	0		int child = conf.stack[conf.stack.size() - 2 + direction];
17872	0	0	for (size_t i = 0; i < labels.size(); i++)
17873	0	0	if (gold.nodes[child].deprel == labels[i])
17874	0	0	if (!conf.single_root ||
		0
17875	0	0	(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) ||
		0
		0
		0
		0
17876	0	0	(i != root_label && conf.stack.size() > 2))
17877	0		transitions.push_back(2 + 2*i + direction);
17878			}
17879			}
17880	0		}
17881
17882	0		transition_oracle::predicted_transition transition_system_swap_oracle_static::tree_oracle_static::predict(const configuration& conf, unsigned /*network_outcome*/, unsigned /*iteration*/) const {
17883			// Use left if appropriate
17884	0	0	if (conf.stack.size() >= 2) {
17885	0		int parent = conf.stack[conf.stack.size() - 1];
17886	0		int child = conf.stack[conf.stack.size() - 2];
17887	0	0	if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) {
		0
		0
17888	0	0	for (size_t i = 0; i < labels.size(); i++)
17889	0	0	if (gold.nodes[child].deprel == labels[i])
17890	0		return predicted_transition(2 + 2*i, 2 + 2*i);
17891
17892	0		assert(!"label was not found");
17893			}
17894			}
17895
17896			// Use right if appropriate
17897	0	0	if (conf.stack.size() >= 2) {
17898	0		int child = conf.stack[conf.stack.size() - 1];
17899	0		int parent = conf.stack[conf.stack.size() - 2];
17900	0	0	if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) {
		0
		0
17901	0	0	for (size_t i = 0; i < labels.size(); i++)
17902	0	0	if (gold.nodes[child].deprel == labels[i])
17903	0		return predicted_transition(2 + 2*i + 1, 2 + 2*i + 1);
17904
17905	0		assert(!"label was not found");
17906			}
17907			}
17908
17909			// Use swap if appropriate
17910	0	0	if (conf.stack.size() >= 2 && !projective_order.empty()) {
		0
		0
17911	0		int last = conf.stack[conf.stack.size() - 1];
17912	0		int prev = conf.stack[conf.stack.size() - 2];
17913	0	0	if (projective_order[last] < projective_order[prev] &&
		0
		0
17914	0	0	(projective_components.empty() ||
17915	0	0	(conf.buffer.empty() || projective_components[last] != projective_components[conf.buffer.back()])))
17916	0		return predicted_transition(1, 1);
17917			}
17918
17919			// Otherwise, just shift
17920	0		return predicted_transition(0, 0);
17921			}
17922
17923			// Oracle factory method
17924	0		transition_oracle* transition_system_swap::oracle(const string& name) const {
17925	0	0	if (name == "static_eager") return new transition_system_swap_oracle_static(labels, false);
		0
17926	0	0	if (name == "static_lazy") return new transition_system_swap_oracle_static(labels, true);
		0
17927			return nullptr;
17928			}
17929
17930			} // namespace parsito
17931
17932			/////////
17933			// File: parsito/tree/tree.cpp
17934			/////////
17935
17936			// This file is part of Parsito .
17937			//
17938			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
17939			// Mathematics and Physics, Charles University in Prague, Czech Republic.
17940			//
17941			// This Source Code Form is subject to the terms of the Mozilla Public
17942			// License, v. 2.0. If a copy of the MPL was not distributed with this
17943			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
17944
17945			namespace parsito {
17946
17947	2		const string tree::root_form = "";
17948
17949	1		tree::tree() {
17950	1	50	clear();
17951	1		}
17952
17953	0		bool tree::empty() {
17954	0		return nodes.size() == 1;
17955			}
17956
17957	2		void tree::clear() {
17958			nodes.clear();
17959			node& root = add_node(root_form);
17960	8		root.lemma = root.upostag = root.xpostag = root.feats = root_form;
17961	2		}
17962
17963	0		node& tree::add_node(const string& form) {
17964	9	0	nodes.emplace_back((int)nodes.size(), form);
		0
		0
		50
17965	0		return nodes.back();
17966			}
17967
17968	38		void tree::set_head(int id, int head, const string& deprel) {
17969	38	50	assert(id >= 0 && id < int(nodes.size()));
		50
17970	38	50	assert(head < int(nodes.size()));
17971
17972			// Remove existing head
17973	38	50	if (nodes[id].head >= 0) {
17974	0		auto& children = nodes[nodes[id].head].children;
17975	0	0	for (size_t i = children.size(); i && children[i-1] >= id; i--)
		0
		0
17976	0	0	if (children[i-1] == id) {
17977			children.erase(children.begin() + i - 1);
17978	0		break;
17979			}
17980			}
17981
17982			// Set new head
17983	76		nodes[id].head = head;
17984	38		nodes[id].deprel = deprel;
17985	38	50	if (head >= 0) {
17986	76		auto& children = nodes[head].children;
17987			size_t i = children.size();
17988	56	100	while (i && children[i-1] > id) i--;
		100
		100
17989	38	100	if (!i || children[i-1] < id) children.insert(children.begin() + i, id);
		50
		50
17990			}
17991	38		}
17992
17993	0		void tree::unlink_all_nodes() {
17994	9	0	for (auto&& node : nodes) {
		0
		0
		0
		100
17995	8		node.head = -1;
17996			node.deprel.clear();
17997			node.children.clear();
17998			}
17999	0		}
18000
18001			} // namespace parsito
18002
18003			/////////
18004			// File: parsito/tree/tree_format.h
18005			/////////
18006
18007			// This file is part of Parsito .
18008			//
18009			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
18010			// Mathematics and Physics, Charles University in Prague, Czech Republic.
18011			//
18012			// This Source Code Form is subject to the terms of the Mozilla Public
18013			// License, v. 2.0. If a copy of the MPL was not distributed with this
18014			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
18015
18016			namespace parsito {
18017
18018			// Input format
18019	0		class tree_input_format {
18020			public:
18021	0		virtual ~tree_input_format() {}
18022
18023			virtual bool read_block(istream& in, string& block) const = 0;
18024			virtual void set_text(string_piece text, bool make_copy = false) = 0;
18025			virtual bool next_tree(tree& t) = 0;
18026			const string& last_error() const;
18027
18028			// Static factory methods
18029			static tree_input_format* new_input_format(const string& name);
18030			static tree_input_format* new_conllu_input_format();
18031
18032			protected:
18033			string error;
18034			};
18035
18036			// Output format
18037	0		class tree_output_format {
18038			public:
18039	0		virtual ~tree_output_format() {}
18040
18041			virtual void write_tree(const tree& t, string& output, const tree_input_format* additional_info = nullptr) const = 0;
18042
18043			// Static factory methods
18044			static tree_output_format* new_output_format(const string& name);
18045			static tree_output_format* new_conllu_output_format();
18046			};
18047
18048			} // namespace parsito
18049
18050			/////////
18051			// File: parsito/tree/tree_format_conllu.h
18052			/////////
18053
18054			// This file is part of Parsito .
18055			//
18056			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
18057			// Mathematics and Physics, Charles University in Prague, Czech Republic.
18058			//
18059			// This Source Code Form is subject to the terms of the Mozilla Public
18060			// License, v. 2.0. If a copy of the MPL was not distributed with this
18061			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
18062
18063			namespace parsito {
18064
18065			// Input CoNLL-U format
18066	0		class tree_input_format_conllu : public tree_input_format {
18067			public:
18068			virtual bool read_block(istream& in, string& block) const override;
18069			virtual void set_text(string_piece text, bool make_copy = false) override;
18070			virtual bool next_tree(tree& t) override;
18071
18072			private:
18073			friend class tree_output_format_conllu;
18074			vector comments;
18075			vector<pair<int, string_piece>> multiword_tokens;
18076
18077			string_piece text;
18078			string text_copy;
18079			};
18080
18081			// Output CoNLL-U format
18082	0		class tree_output_format_conllu : public tree_output_format {
18083			public:
18084			virtual void write_tree(const tree& t, string& output, const tree_input_format* additional_info = nullptr) const override;
18085
18086			private:
18087			static const string underscore;
18088	0	0	const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; }
		0
		0
		0
		0
		0
		0
18089			};
18090
18091			} // namespace parsito
18092
18093			/////////
18094			// File: parsito/tree/tree_format.cpp
18095			/////////
18096
18097			// This file is part of Parsito .
18098			//
18099			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
18100			// Mathematics and Physics, Charles University in Prague, Czech Republic.
18101			//
18102			// This Source Code Form is subject to the terms of the Mozilla Public
18103			// License, v. 2.0. If a copy of the MPL was not distributed with this
18104			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
18105
18106			namespace parsito {
18107
18108	0		const string& tree_input_format::last_error() const {
18109	0		return error;
18110			}
18111
18112			// Input Static factory methods
18113	0		tree_input_format* tree_input_format::new_conllu_input_format() {
18114	0		return new tree_input_format_conllu();
18115			}
18116
18117	0		tree_input_format* tree_input_format::new_input_format(const string& name) {
18118	0	0	if (name == "conllu") return new_conllu_input_format();
18119			return nullptr;
18120			}
18121
18122			// Output static factory methods
18123	0		tree_output_format* tree_output_format::new_conllu_output_format() {
18124	0		return new tree_output_format_conllu();
18125			}
18126
18127	0		tree_output_format* tree_output_format::new_output_format(const string& name) {
18128	0	0	if (name == "conllu") return new_conllu_output_format();
18129			return nullptr;
18130			}
18131
18132			} // namespace parsito
18133
18134			/////////
18135			// File: parsito/tree/tree_format_conllu.cpp
18136			/////////
18137
18138			// This file is part of Parsito .
18139			//
18140			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
18141			// Mathematics and Physics, Charles University in Prague, Czech Republic.
18142			//
18143			// This Source Code Form is subject to the terms of the Mozilla Public
18144			// License, v. 2.0. If a copy of the MPL was not distributed with this
18145			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
18146
18147			namespace parsito {
18148
18149			// Input CoNLL-U format
18150
18151	0		bool tree_input_format_conllu::read_block(istream& in, string& block) const {
18152	0		return bool(getpara(in, block));
18153			}
18154
18155	0		void tree_input_format_conllu::set_text(string_piece text, bool make_copy) {
18156	0	0	if (make_copy) {
18157	0		text_copy.assign(text.str, text.len);
18158			text = string_piece(text_copy.c_str(), text_copy.size());
18159			}
18160	0		this->text = text;
18161	0		}
18162
18163	0		bool tree_input_format_conllu::next_tree(tree& t) {
18164			error.clear();
18165	0		t.clear();
18166			comments.clear();
18167			multiword_tokens.clear();
18168			int last_multiword_token = 0;
18169
18170			vector tokens, parts;
18171	0	0	while (text.len) {
18172			// Read line
18173	0		string_piece line(text.str, 0);
18174	0	0	while (line.len < text.len && line.str[line.len] != '\n') line.len++;
		0
18175	0		text.str += line.len + (line.len < text.len);
18176	0		text.len -= line.len + (line.len < text.len);
18177
18178			// Empty lines denote end of tree, unless at the beginning
18179	0	0	if (!line.len) {
18180	0	0	if (t.empty()) continue;
18181	0		break;
18182			}
18183
18184	0	0	if (*line.str == '#') {
18185			// Store comments at the beginning and ignore the rest
18186	0	0	if (t.empty()) comments.push_back(line);
		0
18187			continue;
18188			}
18189
18190			// Parse another tree node
18191	0	0	split(line, '\t', tokens);
18192	0	0	if (tokens.size() != 10)
18193	0	0	return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' does not contain 10 columns!") , false;
		0
		0
18194
18195			// Store and skip multiword tokens
18196	0	0	if (memchr(tokens[0].str, '-', tokens[0].len)) {
18197	0	0	split(tokens[0], '-', parts);
18198	0	0	if (parts.size() != 2)
18199	0	0	return error.assign("Cannot parse ID of multiword token '").append(line.str, line.len).append("'!") , false;
		0
		0
18200			int from, to;
18201	0	0	if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error))
		0
		0
		0
		0
18202			return false;
18203	0	0	if (from != int(t.nodes.size()))
18204	0	0	return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
		0
		0
		0
		0
18205	0	0	if (to < from)
18206	0	0	return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
		0
		0
		0
		0
18207	0	0	if (from <= last_multiword_token)
18208	0	0	return error.assign("Multiword token '").append(line.str, line.len).append("' overlaps with the previous one!"), false;
		0
		0
18209			last_multiword_token = to;
18210	0	0	multiword_tokens.emplace_back(from, line);
18211	0		continue;
18212			}
18213
18214			// Parse node ID and head
18215			int id;
18216	0	0	if (!parse_int(tokens[0], "CoNLL-U id", id, error))
		0
18217			return false;
18218	0	0	if (id != int(t.nodes.size()))
18219	0	0	return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false;
		0
		0
		0
		0
18220
18221			int head;
18222	0	0	if (tokens[6].len == 1 && tokens[6].str[0] == '_') {
		0
		0
18223	0		head = -1;
18224			} else {
18225	0	0	if (!parse_int(tokens[6], "CoNLL-U head", head, error))
		0
18226			return false;
18227	0	0	if (head < 0)
18228	0	0	return error.assign("Numeric head value '").append(tokens[0].str, tokens[0].len).append("' cannot be negative!"), false;
		0
		0
18229			}
18230
18231			// Add new node
18232	0		auto& node = t.add_node(string(tokens[1].str, tokens[1].len));
18233	0	0	if (!(tokens[2].len == 1 && tokens[2].str[0] == '_')) node.lemma.assign(tokens[2].str, tokens[2].len);
		0
		0
18234	0	0	if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) node.upostag.assign(tokens[3].str, tokens[3].len);
		0
		0
18235	0	0	if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) node.xpostag.assign(tokens[4].str, tokens[4].len);
		0
		0
18236	0	0	if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) node.feats.assign(tokens[5].str, tokens[5].len);
		0
		0
18237	0		node.head = head;
18238	0	0	if (!(tokens[7].len == 1 && tokens[7].str[0] == '_')) node.deprel.assign(tokens[7].str, tokens[7].len);
		0
		0
18239	0	0	if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) node.deps.assign(tokens[8].str, tokens[8].len);
		0
		0
18240	0	0	if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) node.misc.assign(tokens[9].str, tokens[9].len);
		0
		0
18241			}
18242
18243			// Check that we got word for the last multiword token
18244	0	0	if (last_multiword_token >= int(t.nodes.size()))
18245	0	0	return error.assign("There are words missing for multiword token '").append(multiword_tokens.back().second.str, multiword_tokens.back().second.len).append("'!"), false;
		0
		0
18246
18247			// Set heads correctly
18248	0	0	for (auto&& node : t.nodes)
18249	0	0	if (node.id && node.head >= 0) {
		0
18250	0	0	if (node.head >= int(t.nodes.size()))
18251	0	0	return error.assign("Node ID '").append(to_string(node.id)).append("' form '").append(node.form).append("' has too large head: '").append(to_string(node.head)).append("'!"), false;
		0
		0
		0
18252	0	0	t.set_head(node.id, node.head, node.deprel);
18253			}
18254
18255	0		return !t.empty();
18256			}
18257
18258			// Output CoNLL-U format
18259
18260	2		const string tree_output_format_conllu::underscore = "_";
18261
18262	0		void tree_output_format_conllu::write_tree(const tree& t, string& output, const tree_input_format* additional_info) const {
18263			output.clear();
18264
18265			// Try casting input format to CoNLL-U
18266	0	0	auto input_conllu = dynamic_cast<const tree_input_format_conllu*>(additional_info);
18267			size_t input_conllu_multiword_tokens = 0;
18268
18269			// Comments if present
18270	0	0	if (input_conllu)
18271	0	0	for (auto&& comment : input_conllu->comments)
18272	0		output.append(comment.str, comment.len).push_back('\n');
18273
18274			// Print out the tokens
18275	0	0	for (int i = 1 /*skip the root node*/; i < int(t.nodes.size()); i++) {
18276			// Write multiword token if present
18277	0	0	if (input_conllu && input_conllu_multiword_tokens < input_conllu->multiword_tokens.size() &&
		0
		0
		0
18278	0		i == input_conllu->multiword_tokens[input_conllu_multiword_tokens].first) {
18279	0		output.append(input_conllu->multiword_tokens[input_conllu_multiword_tokens].second.str,
18280	0		input_conllu->multiword_tokens[input_conllu_multiword_tokens].second.len).push_back('\n');
18281	0		input_conllu_multiword_tokens++;
18282			}
18283
18284			// Write the token
18285	0	0	output.append(to_string(i)).push_back('\t');
18286	0		output.append(t.nodes[i].form).push_back('\t');
18287	0		output.append(underscore_on_empty(t.nodes[i].lemma)).push_back('\t');
18288	0		output.append(underscore_on_empty(t.nodes[i].upostag)).push_back('\t');
18289	0		output.append(underscore_on_empty(t.nodes[i].xpostag)).push_back('\t');
18290	0		output.append(underscore_on_empty(t.nodes[i].feats)).push_back('\t');
18291	0	0	output.append(t.nodes[i].head < 0 ? "_" : to_string(t.nodes[i].head)).push_back('\t');
		0
18292	0		output.append(underscore_on_empty(t.nodes[i].deprel)).push_back('\t');
18293	0		output.append(underscore_on_empty(t.nodes[i].deps)).push_back('\t');
18294	0		output.append(underscore_on_empty(t.nodes[i].misc)).push_back('\n');
18295			}
18296	0		output.push_back('\n');
18297	0		}
18298
18299			} // namespace parsito
18300
18301			/////////
18302			// File: parsito/version/version.h
18303			/////////
18304
18305			// This file is part of Parsito .
18306			//
18307			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
18308			// Mathematics and Physics, Charles University in Prague, Czech Republic.
18309			//
18310			// This Source Code Form is subject to the terms of the Mozilla Public
18311			// License, v. 2.0. If a copy of the MPL was not distributed with this
18312			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
18313
18314			namespace parsito {
18315
18316	0		struct version {
18317			unsigned major;
18318			unsigned minor;
18319			unsigned patch;
18320			std::string prerelease;
18321
18322			// Returns current version.
18323			static version current();
18324
18325			// Returns multi-line formated version and copyright string.
18326			static string version_and_copyright(const string& other_libraries = string());
18327			};
18328
18329			} // namespace parsito
18330
18331			/////////
18332			// File: parsito/version/version.cpp
18333			/////////
18334
18335			// This file is part of Parsito .
18336			//
18337			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
18338			// Mathematics and Physics, Charles University in Prague, Czech Republic.
18339			//
18340			// This Source Code Form is subject to the terms of the Mozilla Public
18341			// License, v. 2.0. If a copy of the MPL was not distributed with this
18342			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
18343
18344			namespace parsito {
18345
18346			// Returns current version.
18347	0		version version::current() {
18348	0	0	return {1, 1, 1, "devel"};
		0
18349			}
18350
18351			// Returns multi-line formated version and copyright string.
18352	0		string version::version_and_copyright(const string& other_libraries) {
18353	0		ostringstream info;
18354
18355			auto parsito = version::current();
18356			auto unilib = unilib::version::current();
18357
18358	0		info << "Parsito version " << parsito.major << '.' << parsito.minor << '.' << parsito.patch
18359	0	0	<< (parsito.prerelease.empty() ? "" : "-") << parsito.prerelease
		0
18360	0		<< " (using UniLib " << unilib.major << '.' << unilib.minor << '.' << unilib.patch
18361	0	0	<< (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n"
		0
18362			"Copyright 2015 by Institute of Formal and Applied Linguistics, Faculty of\n"
18363	0	0	"Mathematics and Physics, Charles University in Prague, Czech Republic.";
18364
18365	0		return info.str();
18366			}
18367
18368			} // namespace parsito
18369
18370			/////////
18371			// File: sentence/input_format.cpp
18372			/////////
18373
18374			// This file is part of UDPipe .
18375			//
18376			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
18377			// Mathematics and Physics, Charles University in Prague, Czech Republic.
18378			//
18379			// This Source Code Form is subject to the terms of the Mozilla Public
18380			// License, v. 2.0. If a copy of the MPL was not distributed with this
18381			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
18382
18383	2		const string input_format::CONLLU_V1 = "v1";
18384	2		const string input_format::CONLLU_V2 = "v2";
18385	2		const string input_format::GENERIC_TOKENIZER_NORMALIZED_SPACES = "normalized_spaces";
18386	2		const string input_format::GENERIC_TOKENIZER_PRESEGMENTED = "presegmented";
18387	2		const string input_format::GENERIC_TOKENIZER_RANGES = "ranges";
18388
18389			// CoNLL-U input format
18390	0		class input_format_conllu : public input_format {
18391			public:
18392	0		input_format_conllu(unsigned version) : version(version) {}
18393
18394			virtual bool read_block(istream& is, string& block) const override;
18395			virtual void reset_document(string_piece id = string_piece()) override;
18396			virtual void set_text(string_piece text, bool make_copy = false) override;
18397			virtual bool next_sentence(sentence& s, string& error) override;
18398
18399			private:
18400			unsigned version;
18401			string_piece text;
18402			string text_copy;
18403
18404			static const string columns[10];
18405			};
18406
18407	26	100	const string input_format_conllu::columns[10] = {"ID", "FORM", "LEMMA",
18408	2	50	"UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"};
		50
		50
		50
		50
		50
		50
		50
		50
		50
		0
18409
18410	0		bool input_format_conllu::read_block(istream& is, string& block) const {
18411	0		return bool(getpara(is, block));
18412			}
18413
18414	0		void input_format_conllu::reset_document(string_piece /id/) {
18415	0		set_text("");
18416	0		}
18417
18418	0		void input_format_conllu::set_text(string_piece text, bool make_copy) {
18419	0	0	if (make_copy) {
18420	0		text_copy.assign(text.str, text.len);
18421			text = string_piece(text_copy.c_str(), text_copy.size());
18422			}
18423	0		this->text = text;
18424	0		}
18425
18426	0		bool input_format_conllu::next_sentence(sentence& s, string& error) {
18427			error.clear();
18428	0		s.clear();
18429			int last_multiword_token = 0;
18430
18431			vector tokens, parts;
18432	0	0	while (text.len) {
18433			// Read line
18434	0		string_piece line(text.str, 0);
18435	0	0	while (line.len < text.len && (line.str[line.len] != '\r' && line.str[line.len] != '\n')) line.len++;
		0
18436
18437	0		text.str += line.len, text.len -= line.len;
18438	0	0	if (text.len >= 2 && text.str[0] == '\r' && text.str[1] == '\n')
		0
		0
18439	0		text.str += 2, text.len -= 2;
18440	0	0	else if (text.len && *text.str == '\n')
		0
18441	0		text.str++, text.len--;
18442
18443			// Empty lines denote end of tree, unless at the beginning
18444	0	0	if (!line.len) {
18445	0	0	if (s.empty()) continue;
18446	0		break;
18447			}
18448
18449	0	0	if (*line.str == '#') {
18450			// Store comments at the beginning and ignore the rest
18451	0	0	if (s.empty()) s.comments.emplace_back(line.str, line.len);
		0
18452			continue;
18453			}
18454
18455			// Parse the line
18456	0	0	split(line, '\t', tokens);
18457	0	0	if (tokens.size() != 10)
18458	0	0	return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' does not contain 10 columns!") , false;
		0
		0
18459
18460			// Check that no column is empty and contains no spaces (except FORM, LEMMA and MISC in version >= 2)
18461	0	0	for (int i = 0; i < 10; i++) {
18462	0	0	if (!tokens[i].len)
18463	0	0	return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains empty column ").append(columns[i]).append("!"), false;
		0
		0
		0
18464	0	0	if ((version < 2 \|\| (i != 1 && i != 2 && i != 9)) && memchr(tokens[i].str, ' ', tokens[i].len) != NULL)
		0
		0
		0
		0
18465	0	0	return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains spaces in column ").append(columns[i]).append("!"), false;
		0
		0
		0
18466			}
18467
18468			// Handle multiword tokens
18469	0	0	if (memchr(tokens[0].str, '-', tokens[0].len)) {
18470	0	0	split(tokens[0], '-', parts);
18471	0	0	if (parts.size() != 2)
18472	0	0	return error.assign("Cannot parse ID of multiword token '").append(line.str, line.len).append("'!") , false;
		0
		0
18473			int from, to;
18474	0	0	if (!parse_int(parts[0], "CoNLL-U id", from, error) \|\| !parse_int(parts[1], "CoNLL-U id", to, error))
		0
		0
		0
		0
18475			return false;
18476	0	0	if (from != int(s.words.size()))
18477	0	0	return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
		0
		0
		0
		0
18478	0	0	if (to < from)
18479	0	0	return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false;
		0
		0
		0
		0
18480	0	0	if (from <= last_multiword_token)
18481	0	0	return error.assign("Multiword token '").append(line.str, line.len).append("' overlaps with the previous one!"), false;
		0
		0
18482			last_multiword_token = to;
18483	0	0	for (int i = 2; i < 9; i++)
18484	0	0	if (tokens[i].len != 1 \|\| tokens[i].str[0] != '_')
		0
		0
18485	0	0	return error.assign("Column ").append(columns[i]).append(" of an multi-word token '").append(line.str, line.len).append("' is not an empty!"), false;
		0
		0
		0
18486	0	0	s.multiword_tokens.emplace_back(from, to, tokens[1], tokens[9].len == 1 && tokens[9].str[0] == '_' ? string_piece() : tokens[9]);
		0
		0
18487	0		continue;
18488			}
18489
18490			// Handle empty nodes
18491	0	0	if (version >= 2)
18492	0	0	if (memchr(tokens[0].str, '.', tokens[0].len)) {
18493	0	0	split(tokens[0], '.', parts);
18494	0	0	if (parts.size() != 2)
18495	0	0	return error.assign("Cannot parse ID of empty node '").append(line.str, line.len).append("'!") , false;
		0
		0
18496			int id, index;
18497	0	0	if (!parse_int(parts[0], "CoNLL-U empty node id", id, error) \|\| !parse_int(parts[1], "CoNLL-U empty node index", index, error))
		0
		0
		0
		0
18498			return false;
18499	0	0	if (id != int(s.words.size()) - 1)
18500	0	0	return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false;
		0
		0
		0
		0
18501	0	0	if (!((s.empty_nodes.empty() && index == 1) \|\| (!s.empty_nodes.empty() && s.empty_nodes.back().id < id && index == 1) \|\|
		0
		0
		0
		0
18502	0	0	(!s.empty_nodes.empty() && s.empty_nodes.back().id == id && index == s.empty_nodes.back().index + 1)))
		0
		0
		0
18503	0	0	return error.assign("Incorrect ID index '").append(parts[1].str, parts[1].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false;
		0
		0
		0
		0
18504	0	0	for (int i = 6; i < 8; i++)
18505	0	0	if (tokens[i].len != 1 \|\| tokens[i].str[0] != '_')
		0
		0
18506	0	0	return error.assign("Column ").append(columns[i]).append(" of an empty node token '").append(line.str, line.len).append("' is not an empty!"), false;
		0
		0
		0
18507
18508	0	0	s.empty_nodes.emplace_back(id, index);
18509	0		s.empty_nodes.back().form.assign(tokens[1].str, tokens[1].len);
18510	0		s.empty_nodes.back().lemma.assign(tokens[2].str, tokens[2].len);
18511	0	0	if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) s.empty_nodes.back().upostag.assign(tokens[3].str, tokens[3].len);
		0
		0
18512	0	0	if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) s.empty_nodes.back().xpostag.assign(tokens[4].str, tokens[4].len);
		0
		0
18513	0	0	if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) s.empty_nodes.back().feats.assign(tokens[5].str, tokens[5].len);
		0
		0
18514	0	0	if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) s.empty_nodes.back().deps.assign(tokens[8].str, tokens[8].len);
		0
		0
18515	0	0	if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) s.empty_nodes.back().misc.assign(tokens[9].str, tokens[9].len);
		0
		0
18516	0		continue;
18517			}
18518
18519			// Parse word ID and head
18520			int id;
18521	0	0	if (!parse_int(tokens[0], "CoNLL-U id", id, error))
		0
18522			return false;
18523	0	0	if (id != int(s.words.size()))
18524	0	0	return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false;
		0
		0
		0
		0
18525
18526			int head;
18527	0	0	if (tokens[6].len == 1 && tokens[6].str[0] == '_') {
		0
		0
18528	0		head = -1;
18529			} else {
18530	0	0	if (!parse_int(tokens[6], "CoNLL-U head", head, error))
		0
18531			return false;
18532	0	0	if (head < 0)
18533	0	0	return error.assign("Numeric head value '").append(tokens[0].str, tokens[0].len).append("' cannot be negative!"), false;
		0
		0
18534			}
18535
18536			// Add new word
18537			auto& word = s.add_word(tokens[1]);
18538	0		word.lemma.assign(tokens[2].str, tokens[2].len);
18539	0	0	if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) word.upostag.assign(tokens[3].str, tokens[3].len);
		0
		0
18540	0	0	if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) word.xpostag.assign(tokens[4].str, tokens[4].len);
		0
		0
18541	0	0	if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) word.feats.assign(tokens[5].str, tokens[5].len);
		0
		0
18542	0		word.head = head;
18543	0	0	if (!(tokens[7].len == 1 && tokens[7].str[0] == '_')) word.deprel.assign(tokens[7].str, tokens[7].len);
		0
		0
18544	0	0	if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) word.deps.assign(tokens[8].str, tokens[8].len);
		0
		0
18545	0	0	if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) word.misc.assign(tokens[9].str, tokens[9].len);
		0
		0
18546			}
18547
18548			// Check that we got word for the last multiword token
18549	0	0	if (last_multiword_token >= int(s.words.size()))
18550	0	0	return error.assign("There are words missing for multiword token '").append(s.multiword_tokens.back().form).append("'!"), false;
		0
18551
18552			// Set heads correctly
18553	0	0	for (auto&& word : s.words)
18554	0	0	if (word.id && word.head >= 0) {
		0
18555	0	0	if (word.head >= int(s.words.size()))
18556	0	0	return error.assign("Node ID '").append(to_string(word.id)).append("' form '").append(word.form).append("' has too large head: '").append(to_string(word.head)).append("'!"), false;
		0
		0
		0
18557	0	0	s.set_head(word.id, word.head, word.deprel);
18558			}
18559
18560	0		return !s.empty();
18561			}
18562
18563			// Horizontal input format
18564	0		class input_format_horizontal : public input_format {
18565			public:
18566			virtual bool read_block(istream& is, string& block) const override;
18567			virtual void reset_document(string_piece id = string_piece()) override;
18568			virtual void set_text(string_piece text, bool make_copy = false) override;
18569			virtual bool next_sentence(sentence& s, string& error) override;
18570
18571			private:
18572			string_piece text;
18573			string text_copy;
18574			bool new_document = true;
18575			string document_id;
18576			unsigned preceeding_newlines = 2;
18577			unsigned sentence_id = 1;
18578			};
18579
18580	0		bool input_format_horizontal::read_block(istream& is, string& block) const {
18581	0	0	if (getline(is, block))
18582	0		return block.push_back('\n'), true;
18583			return false;
18584			}
18585
18586	0		void input_format_horizontal::reset_document(string_piece id) {
18587	0		new_document = true;
18588	0		document_id.assign(id.str, id.len);
18589	0		preceeding_newlines = 2;
18590	0		sentence_id = 1;
18591	0		set_text("");
18592	0		}
18593
18594	0		void input_format_horizontal::set_text(string_piece text, bool make_copy) {
18595	0	0	if (make_copy) {
18596	0		text_copy.assign(text.str, text.len);
18597			text = string_piece(text_copy.c_str(), text_copy.size());
18598			}
18599	0		this->text = text;
18600	0		}
18601
18602	0		bool input_format_horizontal::next_sentence(sentence& s, string& error) {
18603			error.clear();
18604	0		s.clear();
18605
18606			// Skip spaces and newlines
18607	0	0	while (text.len && (text.str == ' ' \|\| text.str == '\t' \|\| text.str == '\r' \|\| text.str == '\n')) {
		0
		0
		0
18608	0		preceeding_newlines += *text.str == '\n';
18609	0		text.str++, text.len--;
18610			}
18611
18612			// Read space (and tab) separated words
18613	0	0	while (text.len && text.str != '\r' && text.str != '\n') {
		0
		0
18614			string_piece word = text;
18615
18616			// Slurp the word
18617	0	0	while (text.len && text.str != ' ' && text.str != '\t' && text.str != '\r' && text.str != '\n')
		0
		0
		0
		0
18618	0		text.str++, text.len--;
18619	0		word.len = text.str - word.str;
18620			s.add_word(word);
18621
18622			// Replace s by regular spaces
18623	0	0	if (s.words.back().form.find("\302\240") != string::npos) {
18624	0		string& form = s.words.back().form;
18625			size_t form_len = 0;
18626	0	0	for (size_t i = 0; i < form.size(); i++) {
18627	0	0	if (form_len && form[form_len-1] == '\302' && form[i] == '\240')
		0
		0
		0
18628	0		form[form_len - 1] = ' ';
18629			else
18630	0		form[form_len++] = form[i];
18631			}
18632			form.resize(form_len);
18633			}
18634
18635			// Skip spaces
18636	0	0	while (text.len && (text.str == ' ' \|\| text.str == '\t'))
		0
18637	0		text.str++, text.len--;
18638			}
18639
18640	0	0	if (!s.empty()) {
18641			// Mark new document if needed
18642	0	0	if (new_document)
18643	0		s.set_new_doc(true, document_id);
18644	0		new_document = false;
18645
18646			// Mark new paragraph if needed
18647	0	0	if (preceeding_newlines >= 2)
18648	0		s.set_new_par(true);
18649	0		preceeding_newlines = 0;
18650
18651			// Sentence id
18652	0	0	s.set_sent_id(to_string(sentence_id++));
18653			}
18654
18655	0		return !s.empty();
18656			}
18657
18658			// Vertical input format
18659	0		class input_format_vertical : public input_format {
18660			public:
18661			virtual bool read_block(istream& is, string& block) const override;
18662			virtual void reset_document(string_piece id = string_piece()) override;
18663			virtual void set_text(string_piece text, bool make_copy = false) override;
18664			virtual bool next_sentence(sentence& s, string& error) override;
18665
18666			private:
18667			string_piece text;
18668			string text_copy;
18669			bool new_document = true;
18670			string document_id;
18671			unsigned preceeding_newlines = 2;
18672			unsigned sentence_id = 1;
18673			};
18674
18675	0		bool input_format_vertical::read_block(istream& is, string& block) const {
18676	0		return bool(getpara(is, block));
18677			}
18678
18679	0		void input_format_vertical::reset_document(string_piece id) {
18680	0		new_document = true;
18681	0		document_id.assign(id.str, id.len);
18682	0		preceeding_newlines = 2;
18683	0		sentence_id = 1;
18684	0		set_text("");
18685	0		}
18686
18687	0		void input_format_vertical::set_text(string_piece text, bool make_copy) {
18688	0	0	if (make_copy) {
18689	0		text_copy.assign(text.str, text.len);
18690			text = string_piece(text_copy.c_str(), text_copy.size());
18691			}
18692	0		this->text = text;
18693	0		}
18694
18695	0		bool input_format_vertical::next_sentence(sentence& s, string& error) {
18696			error.clear();
18697	0		s.clear();
18698
18699			// Skip tabs and newlines
18700	0	0	while (text.len && (text.str == '\t' \|\| text.str == '\r' \|\| *text.str == '\n')) {
		0
		0
18701	0		preceeding_newlines += *text.str == '\n';
18702	0		text.str++, text.len--;
18703			}
18704
18705			// Read first word without tabs on every line
18706	0	0	while (text.len && text.str != '\r' && text.str != '\n') {
		0
		0
18707			string_piece word = text;
18708
18709			// Slurp the word
18710	0	0	while (text.len && text.str != '\t' && text.str != '\r' && *text.str != '\n')
		0
		0
		0
18711	0		text.str++, text.len--;
18712	0		word.len = text.str - word.str;
18713			s.add_word(word);
18714
18715			// Skip spaces till end of line
18716	0	0	while (text.len && text.str != '\r' && text.str != '\n')
		0
		0
18717	0		text.str++, text.len--;
18718
18719			// Skip one new line
18720	0	0	if (text.len >= 2 && text.str[0] == '\r' && text.str[1] == '\n')
		0
		0
18721	0		text.str += 2, text.len -= 2;
18722	0	0	else if (text.len && *text.str == '\n')
		0
18723	0		text.str++, text.len--;
18724
18725			// Skip tabs on the beginning of the line
18726	0	0	while (text.len && *text.str == '\t')
		0
18727	0		text.str++, text.len--;
18728			}
18729
18730	0	0	if (!s.empty()) {
18731			// Mark new document if needed
18732	0	0	if (new_document)
18733	0		s.set_new_doc(true, document_id);
18734	0		new_document = false;
18735
18736			// Mark new paragraph if needed
18737	0	0	if (preceeding_newlines >= 2)
18738	0		s.set_new_par(true);
18739	0		preceeding_newlines = 0;
18740
18741			// Sentence id
18742	0	0	s.set_sent_id(to_string(sentence_id++));
18743			}
18744
18745	0		return !s.empty();
18746			}
18747
18748			// Presegmented tokenizer
18749	0		class input_format_presegmented_tokenizer : public input_format {
18750			public:
18751	0		input_format_presegmented_tokenizer(input_format* tokenizer) : tokenizer(tokenizer) {}
18752
18753			virtual bool read_block(istream& is, string& block) const override;
18754			virtual void reset_document(string_piece id) override;
18755			virtual void set_text(string_piece text, bool make_copy = false) override;
18756			virtual bool next_sentence(sentence& s, string& error) override;
18757
18758			private:
18759			unique_ptr tokenizer;
18760			string_piece text;
18761			string text_copy;
18762			bool new_document = true;
18763			string document_id;
18764			unsigned preceeding_newlines = 2;
18765			unsigned sentence_id = 1;
18766			};
18767
18768	0		bool input_format_presegmented_tokenizer::read_block(istream& is, string& block) const {
18769	0	0	if (getline(is, block))
18770	0		return block.push_back('\n'), true;
18771			return false;
18772			}
18773
18774	0		void input_format_presegmented_tokenizer::reset_document(string_piece id) {
18775	0		new_document = true;
18776	0		document_id.assign(id.str, id.len);
18777	0		preceeding_newlines = 2;
18778	0		sentence_id = 1;
18779	0		tokenizer->reset_document();
18780	0		set_text("");
18781	0		}
18782
18783	0		void input_format_presegmented_tokenizer::set_text(string_piece text, bool make_copy) {
18784	0	0	if (make_copy) {
18785	0		text_copy.assign(text.str, text.len);
18786			text = string_piece(text_copy.c_str(), text_copy.size());
18787			}
18788	0		this->text = text;
18789	0		}
18790
18791	0		bool input_format_presegmented_tokenizer::next_sentence(sentence& s, string& error) {
18792			error.clear();
18793	0		s.clear();
18794
18795	0		sentence partial;
18796			unsigned following_newlines = 0;
18797	0	0	while (text.len && s.empty()) {
		0
		0
18798			// Move next line from `text' to `line', including leading and following newlines
18799	0		string_piece line(text.str, 0);
18800	0	0	while (line.len < text.len && (line.str[line.len] == '\n' \|\| line.str[line.len] == '\r')) {
		0
18801	0		preceeding_newlines += line.str[line.len] == '\n';
18802	0		line.len++;
18803			}
18804	0	0	while (line.len < text.len && (line.str[line.len] != '\n' && line.str[line.len] != '\r'))
		0
18805	0		line.len++;
18806	0	0	while (line.len < text.len && (line.str[line.len] == '\n' \|\| line.str[line.len] == '\r')) {
		0
18807	0		following_newlines += line.str[line.len] == '\n';
18808	0		line.len++;
18809			}
18810	0		text.str += line.len, text.len -= line.len;
18811
18812			// Add all tokens from the line to `s'
18813	0	0	tokenizer->set_text(line, false);
18814	0	0	while (tokenizer->next_sentence(partial, error)) {
		0
18815			// Append words
18816	0		size_t words = s.words.size() - 1;
18817	0	0	for (size_t i = 1; i < partial.words.size(); i++) {
18818	0		s.words.push_back(move(partial.words[i]));
18819	0		s.words.back().id += words;
18820	0	0	if (s.words.back().head > 0) s.words.back().head += words;
18821			}
18822
18823			// Append multiword_tokens
18824	0	0	for (auto&& multiword_token : partial.multiword_tokens) {
18825	0		s.multiword_tokens.push_back(move(multiword_token));
18826	0		s.multiword_tokens.back().id_first += words;
18827	0		s.multiword_tokens.back().id_last += words;
18828			}
18829
18830			// Append empty nodes
18831	0	0	for (auto&& empty_node : partial.empty_nodes) {
18832	0		s.empty_nodes.push_back(move(empty_node));
18833	0		s.empty_nodes.back().id += words;
18834			}
18835			}
18836	0	0	if (!error.empty()) return false;
18837
18838	0	0	if (s.empty()) {
18839	0		preceeding_newlines += following_newlines;
18840			following_newlines = 0;
18841			}
18842			}
18843
18844	0	0	if (!s.empty()) {
18845			// Mark new document if needed
18846	0	0	if (new_document)
18847	0	0	s.set_new_doc(true, document_id);
18848	0		new_document = false;
18849
18850			// Mark new paragraph if needed
18851	0	0	if (preceeding_newlines >= 2)
18852	0	0	s.set_new_par(true);
18853	0		preceeding_newlines = following_newlines;
18854
18855			// Sentence id
18856	0	0	s.set_sent_id(to_string(sentence_id++));
18857
18858			// Fill "# text" comment
18859	0	0	s.comments.emplace_back("# text = ");
18860	0	0	for (size_t i = 1, j = 0; i < s.words.size(); i++) {
18861	0	0	const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j].form : (const token&)s.words[i].form;
		0
18862	0	0	if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
		0
		0
18863	0		i = s.multiword_tokens[j++].id_last;
18864
18865			s.comments.back().append(tok.form);
18866	0	0	if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' ');
		0
		0
		0
		0
18867			}
18868			}
18869
18870	0		return !s.empty();
18871			}
18872
18873			// Static factory methods
18874	0		input_format* input_format::new_conllu_input_format(const string& options) {
18875			named_values::map parsed_options;
18876			string parse_error;
18877	0	0	if (!named_values::parse(options, parsed_options, parse_error))
		0
18878			return nullptr;
18879
18880			unsigned version = 2;
18881	0	0	if (parsed_options.count(CONLLU_V1))
18882			version = 1;
18883	0	0	if (parsed_options.count(CONLLU_V2))
18884			version = 2;
18885
18886	0	0	return new input_format_conllu(version);
18887			}
18888
18889	0		input_format* input_format::new_generic_tokenizer_input_format(const string& options) {
18890			named_values::map parsed_options;
18891			string parse_error;
18892	0	0	if (!named_values::parse(options, parsed_options, parse_error))
		0
18893			return nullptr;
18894
18895	0		bool normalized_spaces = parsed_options.count(GENERIC_TOKENIZER_NORMALIZED_SPACES);
18896	0		bool token_ranges = parsed_options.count(GENERIC_TOKENIZER_RANGES);
18897
18898	0	0	input_format* result = new morphodita_tokenizer_wrapper(morphodita::tokenizer::new_generic_tokenizer(), nullptr, normalized_spaces, token_ranges);
		0
		0
18899	0	0	return (parsed_options.count(GENERIC_TOKENIZER_PRESEGMENTED) && result) ? input_format::new_presegmented_tokenizer(result) : result;
		0
		0
18900			}
18901
18902	0		input_format* input_format::new_horizontal_input_format(const string& /options/) {
18903	0		return new input_format_horizontal();
18904			}
18905
18906	0		input_format* input_format::new_vertical_input_format(const string& /options/) {
18907	0		return new input_format_vertical();
18908			}
18909
18910	0		input_format* input_format::new_input_format(const string& name) {
18911	0		size_t equal = name.find('=');
18912	0	0	size_t name_len = equal != string::npos ? equal : name.size();
18913	0	0	size_t option_offset = equal != string::npos ? equal + 1 : name.size();
18914
18915	0	0	if (name.compare(0, name_len, "conllu") == 0) return new_conllu_input_format(name.substr(option_offset));
		0
18916	0	0	if (name.compare(0, name_len, "generic_tokenizer") == 0) return new_generic_tokenizer_input_format(name.substr(option_offset));
		0
18917	0	0	if (name.compare(0, name_len, "horizontal") == 0) return new_horizontal_input_format(name.substr(option_offset));
		0
18918	0	0	if (name.compare(0, name_len, "vertical") == 0) return new_vertical_input_format(name.substr(option_offset));
		0
18919			return nullptr;
18920			}
18921
18922	0		input_format* input_format::new_presegmented_tokenizer(input_format* tokenizer) {
18923	0		return new input_format_presegmented_tokenizer(tokenizer);
18924			}
18925
18926			/////////
18927			// File: utils/xml_encoded.h
18928			/////////
18929
18930			// This file is part of UFAL C++ Utils .
18931			//
18932			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
18933			// Mathematics and Physics, Charles University in Prague, Czech Republic.
18934			//
18935			// This Source Code Form is subject to the terms of the Mozilla Public
18936			// License, v. 2.0. If a copy of the MPL was not distributed with this
18937			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
18938
18939			namespace utils {
18940
18941			//
18942			// Declarations
18943			//
18944
18945			// Print xml content while encoding <>& and optionally " using XML entities.
18946			class xml_encoded {
18947			public:
18948	0		xml_encoded(string_piece str, bool encode_quot = false) : str(str), encode_quot(encode_quot) {}
18949
18950			friend ostream& operator<<(ostream& os, xml_encoded data);
18951			private:
18952			string_piece str;
18953			bool encode_quot;
18954			};
18955
18956			inline ostream& operator<<(ostream& os, xml_encoded data);
18957
18958			//
18959			// Definitions
18960			//
18961
18962	0		ostream& operator<<(ostream& os, xml_encoded data) {
18963			string_piece& str = data.str;
18964			const char* to_print = str.str;
18965
18966	0	0	while (str.len) {
18967	0	0	while (str.len && str.str != '<' && str.str != '>' && str.str != '&' && (!data.encode_quot \|\| str.str != '"'))
		0
		0
		0
		0
		0
18968	0		str.str++, str.len--;
18969
18970	0	0	if (str.len) {
18971	0	0	if (to_print < str.str) os.write(to_print, str.str - to_print);
18972	0	0	os << (str.str == '<' ? "<" : str.str == '>' ? ">" : *str.str == '&' ? "&" : """);
		0
		0
18973	0		str.str++, str.len--;
18974			to_print = str.str;
18975			}
18976			}
18977
18978	0	0	if (to_print < str.str) os.write(to_print, str.str - to_print);
18979
18980	0		return os;
18981			}
18982
18983			} // namespace utils
18984
18985			/////////
18986			// File: sentence/output_format.cpp
18987			/////////
18988
18989			// This file is part of UDPipe .
18990			//
18991			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
18992			// Mathematics and Physics, Charles University in Prague, Czech Republic.
18993			//
18994			// This Source Code Form is subject to the terms of the Mozilla Public
18995			// License, v. 2.0. If a copy of the MPL was not distributed with this
18996			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
18997
18998	2		const string output_format::CONLLU_V1 = "v1";
18999	2		const string output_format::CONLLU_V2 = "v2";
19000	2		const string output_format::HORIZONTAL_PARAGRAPHS = "paragraphs";
19001	2		const string output_format::PLAINTEXT_NORMALIZED_SPACES = "normalized_spaces";
19002	2		const string output_format::VERTICAL_PARAGRAPHS = "paragraphs";
19003
19004			// CoNLL-U output format
19005	2		class output_format_conllu : public output_format {
19006			public:
19007	1		output_format_conllu(unsigned version) : version(version) {}
19008
19009			virtual void write_sentence(const sentence& s, ostream& os) override;
19010
19011			private:
19012			unsigned version;
19013			static const string underscore;
19014	14	0	const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; }
		100
		50
		50
		50
		100
		50
		50
		0
		0
		0
		0
		0
		0
19015			ostream& write_with_spaces(ostream& os, const string& str);
19016			};
19017
19018	2		const string output_format_conllu::underscore = "_";
19019
19020	2		void output_format_conllu::write_sentence(const sentence& s, ostream& os) {
19021			// Comments
19022	5	100	for (auto&& comment : s.comments)
19023			os << comment << '\n';
19024
19025			// Words and multiword tokens
19026			size_t multiword_token = 0, empty_node = 0;
19027	9	100	for (int i = 0; i < int(s.words.size()); i++) {
19028			// Write non-root nodes
19029	8	100	if (i > 0) {
19030			// Multiword token if present
19031	7	50	if (multiword_token < s.multiword_tokens.size() &&
		0
		50
19032	0		i == s.multiword_tokens[multiword_token].id_first) {
19033	0		os << s.multiword_tokens[multiword_token].id_first << '-'
19034	0		<< s.multiword_tokens[multiword_token].id_last << '\t';
19035	0		write_with_spaces(os, s.multiword_tokens[multiword_token].form) << "\t_\t_\t_\t_\t_\t_\t_\t"
19036	0		<< underscore_on_empty(s.multiword_tokens[multiword_token].misc) << '\n';
19037	0		multiword_token++;
19038			}
19039
19040			// Write the word
19041	7		os << i << '\t';
19042	7		write_with_spaces(os, s.words[i].form) << '\t';
19043	7		write_with_spaces(os, underscore_on_empty(s.words[i].lemma)) << '\t'
19044	7		<< underscore_on_empty(s.words[i].upostag) << '\t'
19045	7		<< underscore_on_empty(s.words[i].xpostag) << '\t'
19046	7		<< underscore_on_empty(s.words[i].feats) << '\t';
19047	7	50	if (s.words[i].head < 0) os << '_'; else os << s.words[i].head; os << '\t'
19048	7		<< underscore_on_empty(s.words[i].deprel) << '\t'
19049	7		<< underscore_on_empty(s.words[i].deps) << '\t'
19050	7		<< underscore_on_empty(s.words[i].misc) << '\n';
19051			}
19052
19053			// Empty nodes
19054	8	50	if (version >= 2)
19055	8	50	for (; empty_node < s.empty_nodes.size() && i == s.empty_nodes[empty_node].id; empty_node++) {
		0
		50
19056	0		os << i << '.' << s.empty_nodes[empty_node].index << '\t'
19057			<< s.empty_nodes[empty_node].form << '\t'
19058	0		<< underscore_on_empty(s.empty_nodes[empty_node].lemma) << '\t'
19059	0		<< underscore_on_empty(s.empty_nodes[empty_node].upostag) << '\t'
19060	0		<< underscore_on_empty(s.empty_nodes[empty_node].xpostag) << '\t'
19061	0		<< underscore_on_empty(s.empty_nodes[empty_node].feats) << '\t'
19062			<< "_\t"
19063			<< "_\t"
19064	0		<< underscore_on_empty(s.empty_nodes[empty_node].deps) << '\t'
19065	0		<< underscore_on_empty(s.empty_nodes[empty_node].misc) << '\n';
19066			}
19067			}
19068			os << endl;
19069	1		}
19070
19071	14		ostream& output_format_conllu::write_with_spaces(ostream& os, const string& str) {
19072	14	50	if (version >= 2 \|\| str.find(' ') == string::npos)
		0
		50
19073			os << str;
19074			else
19075	0	0	for (auto&& chr : str)
19076	0	0	os << (chr == ' ' ? '_' : chr);
19077
19078	14		return os;
19079			}
19080
19081			// EPE output format
19082	0		class output_format_epe : public output_format {
19083			public:
19084			virtual void write_sentence(const sentence& s, ostream& os) override;
19085			virtual void finish_document(ostream& os) override;
19086
19087			private:
19088	0		class json_builder {
19089			public:
19090	0		json_builder& object() { comma(); json.push_back('{'); stack.push_back('}'); return *this; }
19091	0		json_builder& array() { comma(); json.push_back('['); stack.push_back(']'); return *this; }
19092	0	0	json_builder& close() { if (!stack.empty()) { json.push_back(stack.back()); stack.pop_back(); } comma_needed = true; return *this; }
		0
19093	0		json_builder& key(string_piece name) { comma(); string(name); json.push_back(':'); return *this; }
19094	0	0	json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; }
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
		0
19095	0	0	json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; }
		0
		0
		0
		0
		0
		0
		0
		0
		0
19096	0		json_builder& value_true() { comma(); json.push_back('t'); json.push_back('r'); json.push_back('u'); json.push_back('e'); comma_needed=true; return *this; }
19097
19098			string_piece current() const { return string_piece(json.data(), json.size()); }
19099	0		void clear() { json.clear(); stack.clear(); comma_needed=false; }
19100
19101			private:
19102	0		void comma() {
19103	0	0	if (comma_needed) {
19104	0		json.push_back(',');
19105	0		json.push_back(' ');
19106			}
19107	0		comma_needed = false;
19108	0		}
19109	0		void string(string_piece str) {
19110	0		json.push_back('"');
19111	0	0	for (; str.len; str.str++, str.len--)
19112	0		switch (*str.str) {
19113	0		case '"': json.push_back('\\'); json.push_back('\"'); break;
19114	0		case '\\': json.push_back('\\'); json.push_back('\\'); break;
19115	0		case '\b': json.push_back('\\'); json.push_back('b'); break;
19116	0		case '\f': json.push_back('\\'); json.push_back('f'); break;
19117	0		case '\n': json.push_back('\\'); json.push_back('n'); break;
19118	0		case '\r': json.push_back('\\'); json.push_back('r'); break;
19119	0		case '\t': json.push_back('\\'); json.push_back('t'); break;
19120			default:
19121	0	0	if (((unsigned char)*str.str) < 32) {
19122	0		json.push_back('u'); json.push_back('0'); json.push_back('0'); json.push_back('0' + (str.str >> 4)); json.push_back("0123456789ABCDEF"[str.str & 0xF]);
19123			} else {
19124	0		json.push_back(*str.str);
19125			}
19126			}
19127	0		json.push_back('"');
19128	0		}
19129	0		void number(size_t value) {
19130			size_t start_size = json.size();
19131	0	0	for (; value \|\| start_size == json.size(); value /= 10)
		0
		0
19132	0		json.push_back('0' + (value % 10));
19133			reverse(json.begin() + start_size, json.end());
19134	0		}
19135
19136			std::vector json;
19137			std::vector stack;
19138			bool comma_needed = false;
19139			} json;
19140
19141			vector feats;
19142			size_t sentences = 0;
19143			};
19144
19145	0		void output_format_epe::write_sentence(const sentence& s, ostream& os) {
19146	0	0	json.object().key("id").value(++sentences).key("nodes").array();
		0
		0
		0
19147
19148	0	0	for (size_t i = 1; i < s.words.size(); i++) {
19149	0	0	json.object().key("id").value(i).key("form").value(s.words[i].form);
		0
		0
19150
19151			size_t start, end;
19152	0	0	if (s.words[i].get_token_range(start, end))
19153	0	0	json.key("start").value(start).key("end").value(end);
		0
19154	0	0	if (s.words[i].head == 0)
19155	0		json.key("top").value_true();
19156
19157	0	0	json.key("properties").object()
		0
19158	0	0	.key("lemma").value(s.words[i].lemma)
19159	0	0	.key("upos").value(s.words[i].upostag)
19160	0	0	.key("xpos").value(s.words[i].xpostag);
19161	0		split(s.words[i].feats, '\|', feats);
19162	0	0	for (auto&& feat : feats) {
19163	0		string_piece key(feat.str, 0);
19164	0	0	while (key.len < feat.len && key.str[key.len] != '=')
		0
19165	0		key.len++;
19166	0	0	if (key.len + 1 < feat.len)
19167	0	0	json.key(key).value(string_piece(key.str + key.len + 1, feat.len - key.len - 1));
19168			}
19169	0		json.close();
19170
19171	0	0	if (!s.words[i].children.empty()) {
19172	0		json.key("edges").array();
19173	0	0	for (auto&& child : s.words[i].children)
19174	0	0	json.object().key("label").value(s.words[child].deprel).key("target").value(child).close();
		0
		0
		0
19175	0		json.close();
19176			}
19177
19178	0		json.close();
19179			}
19180	0		json.close().close();
19181
19182			string_piece current = json.current();
19183	0		os.write(current.str, current.len).put('\n');
19184			json.clear();
19185	0		}
19186
19187	0		void output_format_epe::finish_document(ostream& /os/) {
19188	0		sentences = 0;
19189	0		}
19190
19191			// Matxin output format
19192	0		class output_format_matxin : public output_format {
19193			public:
19194			virtual void write_sentence(const sentence& s, ostream& os) override;
19195			virtual void finish_document(ostream& os) override;
19196
19197			private:
19198			void write_node(const sentence& s, int node, string& pad, ostream& os);
19199
19200			int sentences = 0;
19201			};
19202
19203	0		void output_format_matxin::write_sentence(const sentence& s, ostream& os) {
19204	0	0	if (!sentences) {
19205	0		os << "";
19206			}
19207	0		os << "\n\n";
19208
19209			string pad;
19210	0	0	for (auto&& node : s.words[0].children)
19211	0	0	write_node(s, node, pad, os);
19212
19213			os << "" << endl;
19214	0		}
19215
19216	0		void output_format_matxin::finish_document(ostream& os) {
19217	0		os << "\n";
19218
19219	0		sentences = 0;
19220	0		}
19221
19222	0		void output_format_matxin::write_node(const sentence& s, int node, string& pad, ostream& os) {
19223			//
19224	0		pad.push_back(' ');
19225
19226	0	0	os << pad << "
		0
19227	0	0	<< "\" form=\"" << xml_encoded(s.words[node].form, true)
19228	0	0	<< "\" lem=\"" << xml_encoded(s.words[node].lemma, true)
19229	0	0	<< "\" mi=\"" << xml_encoded(s.words[node].feats, true)
19230	0	0	<< "\" si=\"" << xml_encoded(s.words[node].deprel, true) << '"';
19231
19232	0	0	if (s.words[node].children.empty()) {
19233	0		os << "/>\n";
19234			} else {
19235	0		os << ">\n";
19236	0	0	for (auto&& child : s.words[node].children)
19237	0		write_node(s, child, pad, os);
19238	0		os << pad << "\n";
19239			}
19240
19241			pad.pop_back();
19242	0		}
19243
19244			// Horizontal output format
19245	0		class output_format_horizontal : public output_format {
19246			public:
19247	0		output_format_horizontal(bool paragraphs) : paragraphs(paragraphs), empty(true) {}
19248
19249			virtual void write_sentence(const sentence& s, ostream& os) override;
19250	0		virtual void finish_document(ostream& /os/) override { empty = true; }
19251
19252			private:
19253			bool paragraphs;
19254			bool empty;
19255			};
19256
19257	0		void output_format_horizontal::write_sentence(const sentence& s, ostream& os) {
19258	0	0	if (paragraphs && !empty && (s.get_new_doc() \|\| s.get_new_par()))
		0
		0
		0
		0
19259			os << '\n';
19260	0		empty = false;
19261
19262			string line;
19263	0	0	for (size_t i = 1; i < s.words.size(); i++) {
19264			// Append word, but replace spaces by s
19265	0	0	for (auto&& chr : s.words[i].form)
19266	0	0	if (chr == ' ')
19267	0	0	line.append("\302\240");
19268			else
19269	0	0	line.push_back(chr);
19270
19271	0	0	if (i+1 < s.words.size())
19272	0	0	line.push_back(' ');
19273			}
19274			os << line << endl;
19275	0		}
19276
19277			// Plaintext output format
19278	0		class output_format_plaintext : public output_format {
19279			public:
19280	0		output_format_plaintext(bool normalized): normalized(normalized), empty(true) {}
19281
19282			virtual void write_sentence(const sentence& s, ostream& os) override;
19283	0		virtual void finish_document(ostream& /os/) override { empty = true; }
19284			private:
19285			bool normalized;
19286			bool empty;
19287			};
19288
19289	0		void output_format_plaintext::write_sentence(const sentence& s, ostream& os) {
19290	0	0	if (normalized) {
19291	0	0	if (!empty && (s.get_new_doc() \|\| s.get_new_par()))
		0
		0
		0
19292			os << '\n';
19293	0	0	for (size_t i = 1, j = 0; i < s.words.size(); i++) {
19294	0	0	const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j] : (const token&)s.words[i];
		0
19295			os << tok.form;
19296	0	0	if (i+1 < s.words.size() && tok.get_space_after())
		0
		0
19297			os << ' ';
19298	0	0	if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
		0
		0
19299	0		i = s.multiword_tokens[j++].id_last;
19300			}
19301			os << endl;
19302			} else {
19303			string spaces;
19304	0	0	for (size_t i = 1, j = 0; i < s.words.size(); i++) {
19305	0	0	const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j] : (const token&)s.words[i];
		0
19306	0	0	tok.get_spaces_before(spaces); os << spaces;
19307	0	0	tok.get_spaces_in_token(spaces); os << (!spaces.empty() ? spaces : tok.form);
		0
19308	0	0	tok.get_spaces_after(spaces); os << spaces;
19309	0	0	if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
		0
		0
19310	0		i = s.multiword_tokens[j++].id_last;
19311			}
19312			os << flush;
19313			}
19314	0		empty = false;
19315	0		}
19316
19317			// Vertical output format
19318	0		class output_format_vertical : public output_format {
19319			public:
19320	0		output_format_vertical(bool paragraphs) : paragraphs(paragraphs), empty(true) {}
19321
19322			virtual void write_sentence(const sentence& s, ostream& os) override;
19323	0		virtual void finish_document(ostream& /os/) override { empty = true; }
19324
19325			private:
19326			bool paragraphs;
19327			bool empty;
19328			};
19329
19330	0		void output_format_vertical::write_sentence(const sentence& s, ostream& os) {
19331	0	0	if (paragraphs && !empty && (s.get_new_doc() \|\| s.get_new_par()))
		0
		0
		0
		0
19332			os << '\n';
19333	0		empty = false;
19334
19335	0	0	for (size_t i = 1; i < s.words.size(); i++)
19336			os << s.words[i].form << '\n';
19337			os << endl;
19338	0		}
19339
19340			// Static factory methods
19341	1		output_format* output_format::new_conllu_output_format(const string& options) {
19342			named_values::map parsed_options;
19343			string parse_error;
19344	1	50	if (!named_values::parse(options, parsed_options, parse_error))
		50
19345			return nullptr;
19346
19347			unsigned version = 2;
19348	1	50	if (parsed_options.count(CONLLU_V1))
19349			version = 1;
19350	1	50	if (parsed_options.count(CONLLU_V2))
19351			version = 2;
19352
19353	1	50	return new output_format_conllu(version);
19354			}
19355
19356	0		output_format* output_format::new_epe_output_format(const string& /options/) {
19357	0		return new output_format_epe();
19358			}
19359
19360	0		output_format* output_format::new_matxin_output_format(const string& /options/) {
19361	0	0	return new output_format_matxin();
19362			}
19363
19364	0		output_format* output_format::new_horizontal_output_format(const string& options) {
19365			named_values::map parsed_options;
19366			string parse_error;
19367	0	0	if (!named_values::parse(options, parsed_options, parse_error))
		0
19368			return nullptr;
19369
19370	0	0	return new output_format_horizontal(parsed_options.count(HORIZONTAL_PARAGRAPHS));
19371			}
19372
19373	0		output_format* output_format::new_plaintext_output_format(const string& options) {
19374			named_values::map parsed_options;
19375			string parse_error;
19376	0	0	if (!named_values::parse(options, parsed_options, parse_error))
		0
19377			return nullptr;
19378
19379	0	0	return new output_format_plaintext(parsed_options.count(PLAINTEXT_NORMALIZED_SPACES));
19380			}
19381
19382	0		output_format* output_format::new_vertical_output_format(const string& options) {
19383			named_values::map parsed_options;
19384			string parse_error;
19385	0	0	if (!named_values::parse(options, parsed_options, parse_error))
		0
19386			return nullptr;
19387
19388	0	0	return new output_format_vertical(parsed_options.count(VERTICAL_PARAGRAPHS));
19389			}
19390
19391	1		output_format* output_format::new_output_format(const string& name) {
19392	1		size_t equal = name.find('=');
19393	1	50	size_t name_len = equal != string::npos ? equal : name.size();
19394	1	50	size_t option_offset = equal != string::npos ? equal + 1 : name.size();
19395
19396	2	50	if (name.compare(0, name_len, "conllu") == 0) return new_conllu_output_format(name.substr(option_offset));
		50
19397	0	0	if (name.compare(0, name_len, "epe") == 0) return new_epe_output_format(name.substr(option_offset));
		0
19398	0	0	if (name.compare(0, name_len, "matxin") == 0) return new_matxin_output_format(name.substr(option_offset));
19399	0	0	if (name.compare(0, name_len, "horizontal") == 0) return new_horizontal_output_format(name.substr(option_offset));
		0
19400	0	0	if (name.compare(0, name_len, "plaintext") == 0) return new_plaintext_output_format(name.substr(option_offset));
		0
19401	1	0	if (name.compare(0, name_len, "vertical") == 0) return new_vertical_output_format(name.substr(option_offset));
		0
19402			return nullptr;
19403			}
19404
19405			/////////
19406			// File: sentence/sentence.cpp
19407			/////////
19408
19409			// This file is part of UDPipe .
19410			//
19411			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
19412			// Mathematics and Physics, Charles University in Prague, Czech Republic.
19413			//
19414			// This Source Code Form is subject to the terms of the Mozilla Public
19415			// License, v. 2.0. If a copy of the MPL was not distributed with this
19416			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
19417
19418	2		const string sentence::root_form = "";
19419
19420	1		sentence::sentence() {
19421	1	50	clear();
19422	1		}
19423
19424	0		bool sentence::empty() {
19425	0		return words.size() == 1;
19426			}
19427
19428	3		void sentence::clear() {
19429			words.clear();
19430			multiword_tokens.clear();
19431			empty_nodes.clear();
19432	3		comments.clear();
19433
19434			word& root = add_word(root_form);
19435	12		root.lemma = root.upostag = root.xpostag = root.feats = root_form;
19436	3		}
19437
19438	0		word& sentence::add_word(string_piece form) {
19439	10	0	words.emplace_back((int)words.size(), form);
		0
		0
19440	0		return words.back();
19441			}
19442
19443	7		void sentence::set_head(int id, int head, const string& deprel) {
19444	7	50	assert(id >= 0 && id < int(words.size()));
		50
19445	7	50	assert(head < int(words.size()));
19446
19447			// Remove existing head
19448	7	50	if (words[id].head >= 0) {
19449	0		auto& children = words[words[id].head].children;
19450	0	0	for (size_t i = children.size(); i && children[i-1] >= id; i--)
		0
		0
19451	0	0	if (children[i-1] == id) {
19452			children.erase(children.begin() + i - 1);
19453	0		break;
19454			}
19455			}
19456
19457			// Set new head
19458	14		words[id].head = head;
19459	7		words[id].deprel = deprel;
19460	7	50	if (head >= 0) {
19461	14		auto& children = words[head].children;
19462			size_t i = children.size();
19463	7	100	while (i && children[i-1] > id) i--;
		50
		50
19464	7	100	if (!i \|\| children[i-1] < id) children.insert(children.begin() + i, id);
		50
		50
19465			}
19466	7		}
19467
19468	0		void sentence::unlink_all_words() {
19469	0	0	for (auto&& word : words) {
19470	0		word.head = -1;
19471			word.deprel.clear();
19472			word.children.clear();
19473			}
19474	0		}
19475
19476	0		bool sentence::get_new_doc(string* id) const {
19477	0	0	if (get_comment("newdoc id", id))
19478			return true;
19479	0		return get_comment("newdoc", id);
19480			}
19481
19482	1		void sentence::set_new_doc(bool new_doc, string_piece id) {
19483	1		remove_comment("newdoc");
19484	1		remove_comment("newdoc id");
19485
19486	1	50	if (new_doc && id.len)
		50
19487	0		set_comment("newdoc id", id);
19488	1	50	else if (new_doc)
19489	1		set_comment("newdoc");
19490	1		}
19491
19492	0		bool sentence::get_new_par(string* id) const {
19493	0	0	if (get_comment("newpar id", id))
19494			return true;
19495	0		return get_comment("newpar", id);
19496			}
19497
19498	1		void sentence::set_new_par(bool new_par, string_piece id) {
19499	1		remove_comment("newpar");
19500	1		remove_comment("newpar id");
19501
19502	1	50	if (new_par && id.len)
		50
19503	0		set_comment("newpar id", id);
19504	1	50	else if (new_par)
19505	1		set_comment("newpar");
19506	1		}
19507
19508	0		bool sentence::get_sent_id(string& id) const {
19509			id.clear();
19510
19511	0		return get_comment("sent_id", &id);
19512			}
19513
19514	1		void sentence::set_sent_id(string_piece id) {
19515	1		remove_comment("sent_id");
19516
19517	1	50	if (id.len)
19518	1		set_comment("sent_id", id);
19519	1		}
19520
19521	0		bool sentence::get_text(string& text) const {
19522			text.clear();
19523
19524	0		return get_comment("text", &text);
19525			}
19526
19527	0		void sentence::set_text(string_piece text) {
19528	0		remove_comment("text");
19529
19530	0	0	if (text.len)
19531	0		set_comment("text", text);
19532	0		}
19533
19534	0		bool sentence::get_comment(string_piece name, string* value) const {
19535	0	0	for (auto&& comment : comments)
19536	0	0	if (comment[0] == '#') {
19537			// Skip spaces
19538			unsigned j = 1;
19539	0	0	while (j < comment.size() && (comment[j] == ' ' \|\| comment[j] == '\t')) j++;
		0
		0
		0
19540
19541			// Try matching the name
19542	0	0	if (j + name.len <= comment.size() && comment.compare(j, name.len, name.str, name.len) == 0) {
		0
		0
19543	0		j += name.len;
19544	0	0	while (j < comment.size() && (comment[j] == ' ' \|\| comment[j] == '\t')) j++;
		0
		0
		0
19545	0	0	if (j < comment.size() && comment[j] == '=') {
		0
		0
19546			//We have a value
19547	0		j++;
19548	0	0	while (j < comment.size() && (comment[j] == ' ' \|\| comment[j] == '\t')) j++;
		0
		0
		0
19549	0	0	if (value) value->assign(comment, j, comment.size() - j);
19550			} else {
19551			// No value
19552	0	0	if (value) value->clear();
19553			}
19554
19555			return true;
19556			}
19557			}
19558
19559			return false;
19560			}
19561
19562	8		void sentence::remove_comment(string_piece name) {
19563	15	100	for (unsigned i = comments.size(); i--; )
19564	7	50	if (comments[i][0] == '#') {
19565			// Skip spaces
19566			unsigned j = 1;
19567	14	50	while (j < comments[i].size() && (comments[i][j] == ' ' \|\| comments[i][j] == '\t')) j++;
		100
		50
		100
19568
19569			// Remove matching comments
19570	7	100	if (j + name.len <= comments[i].size() && comments[i].compare(j, name.len, name.str, name.len) == 0)
		50
		50
19571	0		comments.erase(comments.begin() + i);
19572			}
19573	8		}
19574
19575	3		void sentence::set_comment(string_piece name, string_piece value) {
19576	3		remove_comment(name);
19577
19578			string comment;
19579	3	50	comment.append("# ").append(name.str, name.len);
		50
19580	3	100	if (value.len) {
19581	1	50	comment.append(" = ");
19582	2	100	for (size_t i = 0; i < value.len; i++)
19583	1	50	comment.push_back(value.str[i] == '\r' \|\| value.str[i] == '\n' ? ' ' : value.str[i]);
		50
19584			}
19585	3		comments.push_back(move(comment));
19586	3		}
19587
19588			/////////
19589			// File: sentence/token.cpp
19590			/////////
19591
19592			// This file is part of UDPipe .
19593			//
19594			// Copyright 2017 Institute of Formal and Applied Linguistics, Faculty of
19595			// Mathematics and Physics, Charles University in Prague, Czech Republic.
19596			//
19597			// This Source Code Form is subject to the terms of the Mozilla Public
19598			// License, v. 2.0. If a copy of the MPL was not distributed with this
19599			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
19600
19601	11		token::token(string_piece form, string_piece misc) {
19602	11	100	if (form.len) this->form.assign(form.str, form.len);
19603	11	50	if (misc.len) this->misc.assign(misc.str, misc.len);
19604	11		}
19605
19606			// CoNLL-U defined SpaceAfter=No feature
19607	6		bool token::get_space_after() const {
19608			string_piece value;
19609
19610	6	100	return !(get_misc_field("SpaceAfter", value) && value.len == 2 && memcmp(value.str, "No", 2) == 0);
		50
		50
19611			}
19612
19613	7		void token::set_space_after(bool space_after) {
19614	7	100	if (space_after)
19615	5		remove_misc_field("SpaceAfter");
19616			else
19617	2		start_misc_field("SpaceAfter").append("No");
19618	7		}
19619
19620			// UDPipe-specific all-spaces-preserving SpacesBefore and SpacesAfter features
19621	0		void token::get_spaces_before(string& spaces_before) const {
19622			string_piece value;
19623
19624	0	0	if (get_misc_field("SpacesBefore", value))
19625	0		unescape_spaces(value, spaces_before);
19626			else
19627			spaces_before.clear();
19628	0		}
19629
19630	7		void token::set_spaces_before(string_piece spaces_before) {
19631	7	50	if (spaces_before.len == 0)
19632	7		remove_misc_field("SpacesBefore");
19633			else
19634	0		append_escaped_spaces(spaces_before, start_misc_field("SpacesBefore"));
19635	7		}
19636
19637	0		void token::get_spaces_after(string& spaces_after) const {
19638			string_piece value;
19639
19640	0	0	if (get_misc_field("SpacesAfter", value))
19641	0		unescape_spaces(value, spaces_after);
19642			else
19643	0	0	spaces_after.assign(get_space_after() ? " " : "");
19644	0		}
19645
19646	7		void token::set_spaces_after(string_piece spaces_after) {
19647	7	100	if (spaces_after.len == 0) {
19648	2		set_space_after(false);
19649	2		remove_misc_field("SpacesAfter");
19650	5	50	} else if (spaces_after.len == 1 && spaces_after.str[0] == ' ') {
		50
19651	5		set_space_after(true);
19652	5		remove_misc_field("SpacesAfter");
19653			} else {
19654	0		set_space_after(true);
19655	0		append_escaped_spaces(spaces_after, start_misc_field("SpacesAfter"));
19656			}
19657	7		}
19658
19659	0		void token::get_spaces_in_token(string& spaces_in_token) const {
19660			string_piece value;
19661
19662	0	0	if (get_misc_field("SpacesInToken", value))
19663	0		unescape_spaces(value, spaces_in_token);
19664			else
19665			spaces_in_token.clear();
19666	0		}
19667
19668	7		void token::set_spaces_in_token(string_piece spaces_in_token) {
19669	7	50	if (spaces_in_token.len == 0)
19670	7		remove_misc_field("SpacesInToken");
19671			else
19672	0		append_escaped_spaces(spaces_in_token, start_misc_field("SpacesInToken"));
19673	7		}
19674
19675			// UDPipe-specific TokenRange feature
19676	0		bool token::get_token_range(size_t& start, size_t& end) const {
19677			string_piece value;
19678
19679	0	0	if (!get_misc_field("TokenRange", value)) return false;
19680
19681	0		start = 0;
19682	0	0	while (value.len && value.str[0] >= '0' && value.str[0] <= '9') {
		0
		0
19683	0	0	if (start > (numeric_limits::max() - (value.str[0] - '0')) / 10)
19684			return false;
19685	0		start = 10 * start + (value.str[0] - '0');
19686	0		value.str++, value.len--;
19687			}
19688
19689	0	0	if (value.len == 0 \|\| value.str[0] != ':') return false;
		0
19690	0		value.str++, value.len--;
19691
19692	0		end = 0;
19693	0	0	while (value.len && value.str[0] >= '0' && value.str[0] <= '9') {
		0
		0
19694	0	0	if (end > (numeric_limits::max() - (value.str[0] - '0')) / 10)
19695			return false;
19696	0		end = 10 * end + (value.str[0] - '0');
19697	0		value.str++, value.len--;
19698			}
19699
19700			return true;
19701			}
19702
19703	0		void token::set_token_range(size_t start, size_t end) {
19704	0	0	if (start == size_t(string::npos))
19705	0		remove_misc_field("TokenRange");
19706			else
19707	0	0	start_misc_field("TokenRange").append(to_string(start)).append(1, ':').append(to_string(end));
19708	0		}
19709
19710			// Private MISC field helpers
19711	12		bool token::get_misc_field(string_piece name, string_piece& value) const {
19712	6	100	for (size_t index = 0; index < misc.size(); ) {
19713	2	50	if (misc.compare(index, name.len, name.str, name.len) == 0 && misc[index + name.len] == '=') {
		50
		50
19714	2		index += name.len + 1;
19715	2		value.str = misc.c_str() + index;
19716	2		value.len = misc.find('\|', index);
19717	2	50	value.len = (value.len == size_t(string::npos) ? misc.size() : value.len) - index;
19718	2		return true;
19719			}
19720	0		index = misc.find('\|', index);
19721	0	0	if (index != size_t(string::npos)) index++;
19722			}
19723			return false;
19724			}
19725
19726	64		void token::remove_misc_field(string_piece name) {
19727	36	100	for (size_t index = 0; index < misc.size(); )
19728	8	100	if (misc.compare(index, name.len, name.str, name.len) == 0 && misc[index + name.len] == '=') {
		50
		100
19729	2		size_t end_index = misc.find('\|', index + name.len + 1);
19730	2	50	if (end_index == size_t(string::npos)) end_index = misc.size();
19731
19732			// Be careful to delete at most one neighboring '\|'
19733	2	50	if (index)
19734	0		misc.erase(index - 1, end_index - (index - 1));
19735			else
19736	2	50	misc.erase(index, end_index + (end_index < misc.size() ? 1 : 0) - index);
19737			} else {
19738	6		index = misc.find('\|', index);
19739	6	50	if (index != size_t(string::npos)) index++;
19740			}
19741	28		}
19742
19743	2		string& token::start_misc_field(string_piece name) {
19744	2		remove_misc_field(name);
19745	2	50	if (!misc.empty()) misc.push_back('\|');
19746	2		misc.append(name.str, name.len).push_back('=');
19747	2		return misc;
19748			}
19749
19750	0		void token::append_escaped_spaces(string_piece spaces, string& escaped_spaces) const {
19751	0	0	for (unsigned i = 0; i < spaces.len; i++)
19752	0		switch (spaces.str[i]) {
19753			case ' ':
19754	0		escaped_spaces.push_back('\\'); escaped_spaces.push_back('s'); break;
19755			case '\|':
19756	0		escaped_spaces.push_back('\\'); escaped_spaces.push_back('p'); break;
19757			case '\t':
19758	0		escaped_spaces.push_back('\\'); escaped_spaces.push_back('t'); break;
19759			case '\r':
19760	0		escaped_spaces.push_back('\\'); escaped_spaces.push_back('r'); break;
19761			case '\n':
19762	0		escaped_spaces.push_back('\\'); escaped_spaces.push_back('n'); break;
19763			case '\\':
19764	0		escaped_spaces.push_back('\\'); escaped_spaces.push_back('\\'); break;
19765			default:
19766	0		escaped_spaces.push_back(spaces.str[i]);
19767			}
19768	0		}
19769
19770	0		void token::unescape_spaces(string_piece escaped_spaces, string& spaces) const {
19771			spaces.clear();
19772
19773	0	0	for (unsigned i = 0; i < escaped_spaces.len; i++)
19774	0	0	if (escaped_spaces.str[i] != '\\' \|\| i+1 >= escaped_spaces.len)
		0
19775	0		spaces.push_back(escaped_spaces.str[i]);
19776	0		else switch (escaped_spaces.str[++i]) {
19777			case 's':
19778	0		spaces.push_back(' '); break;
19779			case 'p':
19780	0		spaces.push_back('\|'); break;
19781			case 't':
19782	0		spaces.push_back('\t'); break;
19783			case 'r':
19784	0		spaces.push_back('\r'); break;
19785			case 'n':
19786	0		spaces.push_back('\n'); break;
19787			case '\\':
19788	0		spaces.push_back('\\'); break;
19789			default:
19790	0		spaces.push_back(escaped_spaces.str[i - 1]);
19791	0		spaces.push_back(escaped_spaces.str[i]);
19792			}
19793	0		}
19794
19795			/////////
19796			// File: tokenizer/detokenizer.h
19797			/////////
19798
19799			// This file is part of UDPipe .
19800			//
19801			// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
19802			// Mathematics and Physics, Charles University in Prague, Czech Republic.
19803			//
19804			// This Source Code Form is subject to the terms of the Mozilla Public
19805			// License, v. 2.0. If a copy of the MPL was not distributed with this
19806			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
19807
19808	0		class detokenizer {
19809			public:
19810			detokenizer(const string& plain_text);
19811
19812			void detokenize(sentence& s) const;
19813			private:
19814			enum { LOWERCASE, CATEGORIZE, TOTAL };
19815
19816			int difference(const string& left, const string& right, bool separate, int mode) const;
19817
19818			static string perform_lowercase(const string& input);
19819			static string perform_categorize(const string& input);
19820			bool has_letters(const string& word) const;
19821			bool only_digits(const string& word) const;
19822
19823	0		class suffix_array {
19824			public:
19825			suffix_array(const string& str);
19826			suffix_array(suffix_array&& other) = default;
19827
19828			unsigned count(const string& data) const;
19829
19830			private:
19831			vector sa;
19832
19833			struct suffix_compare {
19834	0		suffix_compare(const string& str) : str(str) {}
19835	0		bool operator()(unsigned a, unsigned b) const { return str.compare(a, string::npos, str, b, string::npos) < 0; }
19836			private:
19837			const string& str;
19838			} suffix_comparator;
19839
19840			struct suffix_lower_find {
19841	0		suffix_lower_find(const string& str) : str(str) {}
19842	0		bool operator()(unsigned a, const string& data) const { return str.compare(a, data.size(), data) < 0; }
19843
19844			private:
19845			const string& str;
19846			} suffix_lower_finder;
19847
19848			struct suffix_upper_find {
19849	0		suffix_upper_find(const string& str) : str(str) {}
19850	0		bool operator()(const string& data, unsigned a) const { return str.compare(a, data.size(), data) > 0; }
19851
19852			private:
19853			const string& str;
19854			} suffix_upper_finder;
19855			};
19856
19857			string data_lowercased, data_categorized;
19858			suffix_array sa_lowercased, sa_categorized;
19859			};
19860
19861			/////////
19862			// File: tokenizer/detokenizer.cpp
19863			/////////
19864
19865			// This file is part of UDPipe .
19866			//
19867			// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
19868			// Mathematics and Physics, Charles University in Prague, Czech Republic.
19869			//
19870			// This Source Code Form is subject to the terms of the Mozilla Public
19871			// License, v. 2.0. If a copy of the MPL was not distributed with this
19872			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
19873
19874	0		detokenizer::detokenizer(const string& plain_text)
19875			: data_lowercased(perform_lowercase(plain_text)), data_categorized(perform_categorize(plain_text)),
19876	0	0	sa_lowercased(data_lowercased), sa_categorized(data_categorized) {}
		0
		0
19877
19878	0		void detokenizer::detokenize(sentence& s) const {
19879			token* previous_tok = nullptr;
19880	0	0	for (size_t i = 1, j = 0; i < s.words.size(); i++) {
19881	0	0	token* tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (token)&s.multiword_tokens[j] : (token)&s.words[i];
		0
19882
19883	0	0	if (previous_tok) {
19884			// Should we add SpaceAfter=No to the previous form?
19885	0		int score = difference(previous_tok->form, tok->form, true, LOWERCASE);
19886	0	0	if (!score) score = has_letters(previous_tok->form) && has_letters(tok->form) ? -1 : 0;
		0
		0
19887	0	0	if (!score) score = only_digits(previous_tok->form) && only_digits(tok->form) ? -1 : 0;
		0
		0
19888	0	0	if (!score) score = difference(previous_tok->form, tok->form, false, LOWERCASE);
19889	0	0	if (!score) score = difference(previous_tok->form, tok->form, false, CATEGORIZE);
19890	0	0	if (!score) score = difference(previous_tok->form, tok->form, true, CATEGORIZE);
19891
19892	0	0	if (score > 0)
19893	0		previous_tok->set_space_after(false);
19894			}
19895
19896			// Remove the SpaceAfter attribute on current token
19897	0		tok->set_space_after(true);
19898			previous_tok = tok;
19899
19900	0	0	if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
		0
		0
19901	0		i = s.multiword_tokens[j++].id_last;
19902			}
19903	0		}
19904
19905	0		int detokenizer::difference(const string& left, const string& right, bool separate, int mode) const {
19906	0	0	auto& func = mode == LOWERCASE ? perform_lowercase : perform_categorize;
19907	0	0	auto& sa = mode == LOWERCASE ? sa_lowercased : sa_categorized;
19908
19909	0		string left_mapped = func(left);
19910	0	0	string right_mapped = func(right);
19911			string pattern;
19912
19913	0	0	pattern.assign(separate?" ":"").append(left_mapped).append(right_mapped).append(separate?" ":"");
		0
		0
		0
19914	0	0	int together = sa.count(pattern);
19915
19916	0	0	pattern.assign(separate?" ":"").append(left_mapped).append(" ").append(right_mapped).append(separate?" ":"");
		0
		0
		0
		0
19917	0	0	int apart = sa.count(pattern);
19918
19919	0		return together - apart;
19920			}
19921
19922	0		string detokenizer::perform_lowercase(const string& input) {
19923			using namespace unilib;
19924
19925			string output;
19926	0	0	for (auto&& chr : utf8::decoder(input))
19927	0	0	utf8::append(output, unicode::lowercase(chr));
19928	0		return output;
19929			}
19930
19931	0		string detokenizer::perform_categorize(const string& input) {
19932			using namespace unilib;
19933
19934			string output;
19935	0	0	for (auto&& chr : utf8::decoder(input)) {
19936	0		auto category = unicode::category(chr);
19937	0	0	if (category & unicode::C) output.push_back('C');
		0
19938	0	0	if (category & unicode::L) output.push_back('L');
		0
19939	0	0	if (category & unicode::M) output.push_back('M');
		0
19940	0	0	if (category & unicode::N) output.push_back('N');
		0
19941	0	0	if (category & unicode::Pc) output.push_back('c');
		0
19942	0	0	if (category & unicode::Pd) output.push_back('d');
		0
19943	0	0	if (category & unicode::Pe) output.push_back('e');
		0
19944	0	0	if (category & unicode::Pf) output.push_back('f');
		0
19945	0	0	if (category & unicode::Pi) output.push_back('i');
		0
19946	0	0	if (category & unicode::Po) output.push_back('o');
		0
19947	0	0	if (category & unicode::Ps) output.push_back('s');
		0
19948	0	0	if (category & unicode::S) output.push_back('S');
		0
19949	0	0	if (category & unicode::Zl) output.push_back('Z');
		0
19950	0	0	if (category & unicode::Zp) output.push_back('z');
		0
19951	0	0	if (category & unicode::Zs) output.push_back(' ');
		0
19952			}
19953	0		return output;
19954			}
19955
19956	0		bool detokenizer::has_letters(const string& word) const {
19957			using namespace unilib;
19958
19959	0	0	for (auto&& chr : utf8::decoder(word))
19960	0	0	if (unicode::category(chr) & unicode::L)
19961	0		return true;
19962	0		return false;
19963			}
19964
19965	0		bool detokenizer::only_digits(const string& word) const {
19966			using namespace unilib;
19967
19968	0	0	for (auto&& chr : utf8::decoder(word))
19969	0	0	if (unicode::category(chr) & ~unicode::N)
19970	0		return false;
19971	0		return true;
19972			}
19973
19974	0		detokenizer::suffix_array::suffix_array(const string& str) : suffix_comparator(str), suffix_lower_finder(str), suffix_upper_finder(str) {
19975	0	0	sa.reserve(str.size());
19976	0	0	for (unsigned i = 0; i < str.size(); i++)
19977	0	0	sa.push_back(i);
19978
19979			sort(sa.begin(), sa.end(), suffix_comparator);
19980	0		}
19981
19982	0		unsigned detokenizer::suffix_array::count(const string& data) const {
19983			auto lower_it = lower_bound(sa.begin(), sa.end(), data, suffix_lower_finder);
19984			auto upper_it = upper_bound(sa.begin(), sa.end(), data, suffix_upper_finder);
19985	0		return upper_it - lower_it;
19986			}
19987
19988			/////////
19989			// File: tokenizer/morphodita_tokenizer_wrapper.cpp
19990			/////////
19991
19992			// This file is part of UDPipe .
19993			//
19994			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
19995			// Mathematics and Physics, Charles University in Prague, Czech Republic.
19996			//
19997			// This Source Code Form is subject to the terms of the Mozilla Public
19998			// License, v. 2.0. If a copy of the MPL was not distributed with this
19999			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
20000
20001	1		morphodita_tokenizer_wrapper::morphodita_tokenizer_wrapper(morphodita::tokenizer* tokenizer, const multiword_splitter* splitter,
20002			bool normalized_spaces, bool token_ranges)
20003	1	50	: tokenizer(tokenizer), splitter(splitter), normalized_spaces(normalized_spaces), token_ranges(token_ranges) {}
20004
20005	0		bool morphodita_tokenizer_wrapper::read_block(istream& is, string& block) const {
20006	0		return bool(getpara(is, block));
20007			}
20008
20009	0		void morphodita_tokenizer_wrapper::reset_document(string_piece id) {
20010	0		new_document = true;
20011	0		document_id.assign(id.str, id.len);
20012	0		preceeding_newlines = 2;
20013	0		sentence_id = 1;
20014	0		set_text("");
20015	0		unicode_offset = 0;
20016	0		text_unicode_length = 0;
20017			saved_spaces.clear();
20018	0		}
20019
20020	1		void morphodita_tokenizer_wrapper::set_text(string_piece text, bool make_copy) {
20021			// Start by skipping spaces and copying them to saved_spaces
20022			string_piece following;
20023	1	50	for (char32_t chr;
20024	2	50	text.len && (following = text, chr = unilib::utf8::decode(following.str, following.len),
		50
20025	1	50	(unilib::unicode::category(chr) & unilib::unicode::Zs) \|\| chr == '\r' \|\| chr == '\n' \|\| chr == '\t');
		50
20026	0		text = following, unicode_offset++)
20027	0		saved_spaces.append(text.str, following.str - text.str);
20028
20029			// Offset unicode_offset by length of previous text, update text_unicode_length for the new text
20030	1		unicode_offset += text_unicode_length;
20031	1		text_unicode_length = 0;
20032	35	100	for (following = text; following.len; unilib::utf8::decode(following.str, following.len))
20033	34		text_unicode_length++;
20034
20035			// Copy the text to local storage if needed
20036	1	50	if (make_copy) {
20037	1		text_copy.assign(text.str, text.len);
20038			text = string_piece(text_copy.c_str(), text_copy.size());
20039			}
20040
20041			// Store the text locally and in the morphodita::tokenizer
20042	1		this->text = text;
20043	1		tokenizer->set_text(this->text, false);
20044
20045	1		}
20046
20047	2		bool morphodita_tokenizer_wrapper::next_sentence(sentence& s, string& error) {
20048			unsigned following_newlines = 0;
20049
20050	2		s.clear();
20051			error.clear();
20052
20053	2	50	if (tokenizer->next_sentence(&forms, token_ranges ? &tokens : nullptr)) {
		100
20054			// The forms returned by GRU tokenizer should not start/end with spaces,
20055			// but we trim them anyway (including all "remove empty forms/sentences" machinery).
20056	8	100	for (size_t i = 0; i < forms.size(); i++) {
20057	14	50	while (forms[i].len && (forms[i].str[0] == '\r' \|\| forms[i].str[0] == '\n' \|\|
		50
		50
		50
		50
20058	7	50	forms[i].str[0] == '\t' \|\| forms[i].str[0] == ' '))
20059	0		forms[i].str++, forms[i].len--;
20060	14	50	while (forms[i].len && (forms[i].str[forms[i].len-1] == '\r' \|\| forms[i].str[forms[i].len-1] == '\n' \|\|
		50
		50
		50
		50
20061	7	50	forms[i].str[forms[i].len-1] == '\t' \|\| forms[i].str[forms[i].len-1] == ' '))
20062	0		forms[i].len--;
20063	7	50	if (!forms[i].len)
20064	0		forms.erase(forms.begin() + i--);
20065			}
20066	8	50	if (!forms.size()) return next_sentence(s, error);
20067
20068	8	100	for (size_t i = 0; i < forms.size(); i++) {
20069			// The form might contain spaces, even '\r', '\n' or '\t',
20070			// which we change to space. We also normalize multiple spaces to one.
20071			tok.form.clear();
20072	41	100	for (size_t j = 0; j < forms[i].len; j++) {
20073	34		char chr = forms[i].str[j];
20074	34	50	if (chr == '\r' \|\| chr == '\n' \|\| chr == '\t') chr = ' ';
		50
20075	34	50	if (chr != ' ' \|\| tok.form.empty() \|\| tok.form.back() != ' ')
		0
		0
		50
20076	34		tok.form.push_back(chr);
20077			}
20078
20079			// Track pre-sentence spaces and store SpacesBefore
20080	7	100	if (i == 0) {
20081	1	50	if (forms[0].str > text.str)
20082	0		saved_spaces.append(text.str, forms[0].str - text.str);
20083	1		preceeding_newlines += count(saved_spaces.begin(), saved_spaces.end(), '\n');
20084			}
20085	7	50	if (!normalized_spaces) {
20086	15	100	tok.set_spaces_before(i == 0 ? saved_spaces : "");
		50
20087			}
20088			saved_spaces.clear();
20089
20090			// Track post-sentence spaces and store SpaceAfter, SpacesInToken and SpacesAfter
20091	7	100	if (i+1 == forms.size()) {
20092	1		text.len -= forms[i].str + forms[i].len - text.str;
20093	1		text.str = forms[i].str + forms[i].len;
20094
20095			string_piece following;
20096	3	100	for (char32_t chr; text.len && (following = text, chr = unilib::utf8::decode(following.str, following.len),
		50
		100
20097	0	0	(unilib::unicode::category(chr) & unilib::unicode::Zs) \|\| chr == '\r' \|\| chr == '\n' \|\| chr == '\t'); text = following)
		0
20098	1		saved_spaces.append(text.str, following.str - text.str);
20099
20100	1		following_newlines += count(saved_spaces.begin(), saved_spaces.end(), '\n');
20101			}
20102	7	50	if (normalized_spaces) {
20103	0	0	tok.set_space_after(i+1 == forms.size() ? !saved_spaces.empty() : forms[i+1].str > forms[i].str + forms[i].len);
20104			} else {
20105	7	50	tok.set_spaces_in_token(tok.form.size() != forms[i].len ? forms[i] : "");
20106	7	100	tok.set_spaces_after(i+1 == forms.size() ? saved_spaces : string_piece(forms[i].str + forms[i].len, forms[i+1].str - forms[i].str - forms[i].len));
20107			}
20108			saved_spaces.clear();
20109
20110			// Store TokenRange if requested
20111	7	50	if (token_ranges)
20112	0		tok.set_token_range(unicode_offset + tokens[i].start, unicode_offset + tokens[i].start + tokens[i].length);
20113
20114	7	50	if (splitter)
20115	7		splitter->append_token(tok.form, tok.misc, s);
20116			else
20117	0		s.add_word(tok.form).misc.assign(tok.misc);
20118			}
20119
20120			// Mark new document if needed
20121	1	50	if (new_document) {
20122	1		s.set_new_doc(true, document_id);
20123	1		new_document = false;
20124			}
20125
20126			// Mark new paragraph if needed
20127	1	50	if (preceeding_newlines >= 2)
20128	1		s.set_new_par(true);
20129	1		preceeding_newlines = following_newlines;
20130
20131	1	50	s.set_sent_id(to_string(sentence_id++));
20132
20133			// Fill "# text" comment
20134	8		s.comments.emplace_back("# text = ");
20135	8	100	for (size_t i = 1, j = 0; i < s.words.size(); i++) {
20136	7	50	const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j].form : (const token&)s.words[i].form;
		0
20137	7	50	if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i))
		0
		50
20138	0		i = s.multiword_tokens[j++].id_last;
20139
20140			s.comments.back().append(tok.form);
20141	7	100	if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' ');
		100
		100
20142			}
20143
20144			return true;
20145			}
20146
20147			// Save unused text parts.
20148	1	50	if (text.len) {
20149	0		saved_spaces.append(text.str, text.len);
20150	0		text.str += text.len;
20151	2		text.len = 0;
20152			}
20153
20154			return false;
20155			}
20156
20157			/////////
20158			// File: tokenizer/multiword_splitter.cpp
20159			/////////
20160
20161			// This file is part of UDPipe .
20162			//
20163			// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
20164			// Mathematics and Physics, Charles University in Prague, Czech Republic.
20165			//
20166			// This Source Code Form is subject to the terms of the Mozilla Public
20167			// License, v. 2.0. If a copy of the MPL was not distributed with this
20168			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
20169
20170	7		void multiword_splitter::append_token(string_piece token, string_piece misc, sentence& s) const {
20171			using namespace unilib;
20172
20173			// Buffer
20174			s.add_word();
20175	7		string& buffer = s.words.back().form;
20176
20177			// Lowercase the token
20178	7		utf8::map(unicode::lowercase, token.str, token.len, buffer);
20179			reverse(buffer.begin(), buffer.end());
20180
20181			// Try finding lowercased version in the full_rules
20182			size_t prefix_len = 0;
20183			auto it = full_rules.find(buffer);
20184
20185	7	50	if (it == full_rules.end()) {
20186	7	50	if (version >= 2) {
20187	0		string& suffix = s.words.back().misc;
20188			// Try searching suffix_rules if needed
20189	0	0	while (suffix.size() + 1 < buffer.size()) {
20190	0		suffix.push_back(buffer[suffix.size()]);
20191
20192			auto suffix_it = suffix_rules.find(suffix);
20193	0	0	if (suffix_it == suffix_rules.end())
20194			break;
20195
20196	0	0	if (!suffix_it->second.words.empty()) {
20197			it = suffix_it;
20198	0		prefix_len = buffer.size() - suffix.size();
20199			}
20200			}
20201			suffix.clear();
20202			}
20203
20204	7	50	if (!prefix_len) {
20205			// No match
20206	14		s.words.back().form.assign(token.str, token.len);
20207	7	100	if (misc.len) s.words.back().misc.assign(misc.str, misc.len);
20208			return;
20209			}
20210			}
20211
20212			// Determine casing
20213			enum { UC_FIRST, UC_ALL, UC_OTHER }; int casing = UC_OTHER;
20214
20215	0	0	if (unicode::category(utf8::first(token.str, token.len)) & unicode::Lut) {
20216			casing = UC_ALL;
20217	0	0	for (auto&& chr : utf8::decoder(token.str, token.len))
20218	0	0	if (unicode::category(chr) & (unicode::L & ~unicode::Lut)) { casing = UC_FIRST; break; }
20219			}
20220
20221			// Fill the multiword token
20222	0		s.multiword_tokens.emplace_back(s.words.back().id, s.words.back().id + (int)it->second.words.size() - 1, token, misc);
20223
20224			s.words.back().form.clear();
20225	0	0	if (prefix_len) {
20226			// Note that prefix_len is measured in byte length of lowercased characters
20227	0		string_piece suffix(token);
20228	0	0	while (s.words.back().form.size() < prefix_len && suffix.len)
		0
		0
20229	0		utf8::append(s.words.back().form, unicode::lowercase(utf8::decode(suffix.str, suffix.len)));
20230	0		s.words.back().form.assign(token.str, token.len - suffix.len);
20231			}
20232	0	0	for (auto&& chr : utf8::decoder(it->second.words[0]))
20233	0	0	utf8::append(s.words.back().form, casing == UC_ALL \|\| (casing == UC_FIRST && s.words.back().form.empty()) ? unicode::uppercase(chr) : chr);
		0
		0
20234
20235	0	0	for (size_t i = 1; i < it->second.words.size(); i++)
20236	0	0	if (casing != UC_ALL) {
20237			s.add_word(it->second.words[i]);
20238			} else {
20239			s.add_word();
20240	0		utf8::map(unicode::uppercase, it->second.words[i], s.words.back().form);
20241			}
20242			}
20243
20244	1		multiword_splitter* multiword_splitter::load(istream& is) {
20245			char version;
20246	1	50	if (!is.get(version)) return nullptr;
20247	1	50	if (!(version >= 1 && version <= VERSION_LATEST)) return nullptr;
20248
20249			binary_decoder data;
20250	1	50	if (!compressor::load(is, data)) return nullptr;
		50
20251
20252	1	50	unique_ptr splitter(new multiword_splitter(version));
20253			try {
20254	1	50	for (unsigned full_rules = data.next_4B(); full_rules; full_rules--) {
		50
20255			string full_rule;
20256	0	0	data.next_str(full_rule);
20257			reverse(full_rule.begin(), full_rule.end());
20258
20259			// Add the full_rule and its words
20260			auto& info = splitter->full_rules[full_rule];
20261	0	0	for (unsigned words = data.next_1B(); words; words--) {
		0
20262	0	0	info.words.emplace_back();
20263	0	0	data.next_str(info.words.back());
20264			}
20265	0	0	if (info.words.empty()) return nullptr;
20266			}
20267
20268	1	50	if (version >= 2)
20269	0	0	for (unsigned suffix_rules = data.next_4B(); suffix_rules; suffix_rules--) {
		0
20270			string suffix_rule;
20271	0	0	data.next_str(suffix_rule);
20272			reverse(suffix_rule.begin(), suffix_rule.end());
20273
20274			// Add the suffix_rule and its words
20275			auto& info = splitter->suffix_rules[suffix_rule];
20276	0	0	for (unsigned words = data.next_1B(); words; words--) {
		0
20277	0	0	info.words.emplace_back();
20278	0	0	data.next_str(info.words.back());
20279			}
20280	0	0	if (info.words.empty()) return nullptr;
20281
20282			// Add prefixes of the suffix with empty data
20283	0	0	if (!suffix_rule.empty())
20284	0	0	for (suffix_rule.pop_back(); !suffix_rule.empty(); suffix_rule.pop_back())
20285			splitter->suffix_rules[suffix_rule];
20286		0	}
20287			} catch (binary_decoder_error&) {
20288			return nullptr;
20289			}
20290
20291	1	50	return data.is_end() ? splitter.release() : nullptr;
20292			}
20293
20294			/////////
20295			// File: tokenizer/multiword_splitter_trainer.h
20296			/////////
20297
20298			// This file is part of UDPipe .
20299			//
20300			// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
20301			// Mathematics and Physics, Charles University in Prague, Czech Republic.
20302			//
20303			// This Source Code Form is subject to the terms of the Mozilla Public
20304			// License, v. 2.0. If a copy of the MPL was not distributed with this
20305			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
20306
20307			class multiword_splitter_trainer {
20308			public:
20309			static bool train(const vector& data, ostream& os, string& error);
20310			};
20311
20312			/////////
20313			// File: tokenizer/multiword_splitter_trainer.cpp
20314			/////////
20315
20316			// This file is part of UDPipe .
20317			//
20318			// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of
20319			// Mathematics and Physics, Charles University in Prague, Czech Republic.
20320			//
20321			// This Source Code Form is subject to the terms of the Mozilla Public
20322			// License, v. 2.0. If a copy of the MPL was not distributed with this
20323			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
20324
20325	0		bool multiword_splitter_trainer::train(const vector& data, ostream& os, string& error) {
20326			using namespace unilib;
20327			error.clear();
20328
20329			// Train
20330	0		struct rule_info {
20331			vector words;
20332			unsigned count = 0;
20333			};
20334			map full_rules, suffix_rules;
20335
20336			// Full rules
20337			string lc_form;
20338	0		vector lc_words;
20339	0	0	for (auto&& sentence : data)
20340	0	0	for (auto&& multiword : sentence.multiword_tokens) {
20341			utf8::map(unicode::lowercase, multiword.form, lc_form);
20342	0		lc_words.clear();
20343	0	0	for (int i = multiword.id_first; i <= multiword.id_last; i++)
20344	0	0	utf8::map(unicode::lowercase, sentence.words[i].form, (lc_words.emplace_back(), lc_words.back()));
20345
20346	0	0	auto& info = full_rules[lc_form];
20347	0	0	if (info.words.empty())
20348	0		info.words.assign(lc_words.begin(), lc_words.end());
20349	0		info.count += lc_words == info.words;
20350	0	0	if (!info.count) full_rules.erase(lc_form);
20351			}
20352
20353			// Remove the full rules which trigger too negatively
20354	0	0	for (auto&& sentence : data)
20355	0	0	for (size_t i = 1, j = 0; i < sentence.words.size(); i++) {
20356	0	0	if (j < sentence.multiword_tokens.size() && sentence.multiword_tokens[j].id_first == int(i)) {
		0
		0
20357	0		i = sentence.multiword_tokens[j++].id_last;
20358			continue;
20359			}
20360
20361			utf8::map(unicode::lowercase, sentence.words[i].form, lc_form);
20362			auto it = full_rules.find(lc_form);
20363	0	0	if (it != full_rules.end())
20364	0	0	if (!--it->second.count)
20365			full_rules.erase(it);
20366			}
20367
20368			// Suffix rules
20369	0	0	for (auto&& full_rule : full_rules) {
20370			size_t prefix_match = 0;
20371	0	0	while (prefix_match < full_rule.first.size() && prefix_match < full_rule.second.words[0].size()) prefix_match++;
		0
		0
20372	0	0	for (; prefix_match; prefix_match--)
20373	0	0	if (((unsigned char)full_rule.first[prefix_match]) < 0x80 \|\| ((unsigned char)full_rule.first[prefix_match]) >= 0xC0) {
		0
		0
20374	0	0	lc_form.assign(full_rule.first, prefix_match, string::npos);
20375			lc_words.assign(full_rule.second.words.begin(), full_rule.second.words.end());
20376	0	0	lc_words[0].erase(0, prefix_match);
20377
20378	0	0	auto& info = suffix_rules[lc_form];
20379	0	0	if (info.words.empty())
20380	0		info.words.assign(lc_words.begin(), lc_words.end());
20381	0		info.count += lc_words == info.words;
20382	0	0	if (!info.count) suffix_rules.erase(lc_form);
20383			}
20384			}
20385
20386			// Remove the suffix rules which trigger too negatively
20387	0	0	for (auto&& sentence : data)
20388	0	0	for (size_t i = 1, j = 0; i < sentence.words.size(); i++) {
20389	0	0	if (j < sentence.multiword_tokens.size() && sentence.multiword_tokens[j].id_first == int(i)) {
		0
		0
20390	0		i = sentence.multiword_tokens[j++].id_last;
20391	0		continue;
20392			}
20393
20394			utf8::map(unicode::lowercase, sentence.words[i].form, lc_form);
20395	0	0	while (lc_form.size() > 1) {
20396	0	0	lc_form.erase(0, 1);
20397			auto it = suffix_rules.find(lc_form);
20398	0	0	if (it != suffix_rules.end()) {
20399	0	0	if (it->second.count <= 10)
20400			suffix_rules.erase(it);
20401			else
20402	0		it->second.count -= 10;
20403			}
20404			}
20405			}
20406
20407			// Encode
20408	0	0	binary_encoder enc;
20409	0		enc.add_4B(full_rules.size());
20410	0	0	for (auto&& full_rule : full_rules) {
20411	0	0	enc.add_str(full_rule.first);
20412	0	0	enc.add_1B(full_rule.second.words.size());
20413	0	0	for (auto& word : full_rule.second.words)
20414	0	0	enc.add_str(word);
20415			}
20416	0		enc.add_4B(suffix_rules.size());
20417	0	0	for (auto&& suffix_rule : suffix_rules) {
20418	0	0	enc.add_str(suffix_rule.first);
20419	0	0	enc.add_1B(suffix_rule.second.words.size());
20420	0	0	for (auto& word : suffix_rule.second.words)
20421	0	0	enc.add_str(word);
20422			}
20423
20424			// Save
20425	0	0	os.put(multiword_splitter::VERSION_LATEST);
20426	0	0	if (!compressor::save(os, enc)) return error.assign("Cannot encode multiword_splitter!"), false;
		0
		0
20427
20428			return true;
20429			}
20430
20431			/////////
20432			// File: trainer/trainer.h
20433			/////////
20434
20435			// This file is part of UDPipe .
20436			//
20437			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
20438			// Mathematics and Physics, Charles University in Prague, Czech Republic.
20439			//
20440			// This Source Code Form is subject to the terms of the Mozilla Public
20441			// License, v. 2.0. If a copy of the MPL was not distributed with this
20442			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
20443
20444			class trainer {
20445			public:
20446			static bool train(const string& method, const vector& train, const vector& heldout,
20447			const string& tokenizer, const string& tagger, const string& parser, ostream& os, string& error);
20448
20449			static const string DEFAULT;
20450			static const string NONE;
20451
20452			protected:
20453			static unsigned hyperparameter_integer(unsigned run, unsigned index, unsigned minimum, unsigned maximum);
20454			static double hyperparameter_uniform(unsigned run, unsigned index, double minimum, double maximum);
20455			static double hyperparameter_logarithmic(unsigned run, unsigned index, double minimum, double maximum);
20456
20457			private:
20458			static double rnd(unsigned run, unsigned index);
20459			};
20460
20461			/////////
20462			// File: trainer/trainer_morphodita_parsito.h
20463			/////////
20464
20465			// This file is part of UDPipe .
20466			//
20467			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
20468			// Mathematics and Physics, Charles University in Prague, Czech Republic.
20469			//
20470			// This Source Code Form is subject to the terms of the Mozilla Public
20471			// License, v. 2.0. If a copy of the MPL was not distributed with this
20472			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
20473
20474			class trainer_morphodita_parsito : public trainer {
20475			public:
20476			static bool train(const vector& training, const vector& heldout,
20477			const string& tokenizer, const string& tagger, const string& parser, ostream& os, string& error);
20478
20479			private:
20480			static bool train_tokenizer(const vector& training, const vector& heldout,
20481			const string& options, ostream& os, string& error);
20482			static bool train_tagger(const vector& training, const vector& heldout,
20483			const string& options, ostream& os, string& error);
20484			static bool train_parser(const vector& training, const vector& heldout,
20485			const string& options, const string& tagger_model, ostream& os, string& error);
20486
20487			// Generic model methods
20488			enum model_type { TOKENIZER_MODEL, TAGGER_MODEL, PARSER_MODEL };
20489			static bool load_model(const string& data, model_type model, string_piece& range);
20490			static const string& model_normalize_form(string_piece form, string& output);
20491			static const string& model_normalize_lemma(string_piece lemma, string& output);
20492			static void model_fill_word_analysis(const morphodita::tagged_lemma& analysis, bool upostag, int lemma, bool xpostag, bool feats, word& word);
20493
20494			// Tagger-specific model methods
20495			static bool train_tagger_model(const vector& training, const vector& heldout,
20496			unsigned model, unsigned models, const named_values::map& tagger, ostream& os, string& error);
20497			static bool can_combine_tag(const word& w, string& error);
20498			static const string& combine_tag(const word& w, bool xpostag, bool feats, string& combined_tag);
20499			static const string& most_frequent_tag(const vector& data, const string& upostag, bool xpostag, bool feats, string& combined_tag);
20500			static const string& combine_lemma(const word& w, int use_lemma, string& combined_lemma, const unordered_set& flat_lemmas = unordered_set());
20501
20502			// Generic options handling
20503			static const string& option_str(const named_values::map& options, const string& name, int model = -1);
20504			static bool option_int(const named_values::map& options, const string& name, int& value, string& error, int model = -1);
20505			static bool option_bool(const named_values::map& options, const string& name, bool& value, string& error, int model = -1);
20506			static bool option_double(const named_values::map& options, const string& name, double& value, string& error, int model = -1);
20507
20508			// Various string data
20509			static const string empty_string;
20510			static const string tag_separators;
20511			static const string tagger_features_tagger;
20512			static const string tagger_features_lemmatizer;
20513			static const string parser_nodes;
20514			};
20515
20516			/////////
20517			// File: trainer/trainer.cpp
20518			/////////
20519
20520			// This file is part of UDPipe .
20521			//
20522			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
20523			// Mathematics and Physics, Charles University in Prague, Czech Republic.
20524			//
20525			// This Source Code Form is subject to the terms of the Mozilla Public
20526			// License, v. 2.0. If a copy of the MPL was not distributed with this
20527			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
20528
20529	2		const string trainer::DEFAULT;
20530	2		const string trainer::NONE = "none";
20531
20532	0		bool trainer::train(const string& method, const vector& training, const vector& heldout,
20533			const string& tokenizer, const string& tagger, const string& parser, ostream& os, string& error) {
20534			error.clear();
20535
20536	0	0	stringstream os_buffer;
20537	0	0	os_buffer.put(method.size());
20538	0	0	os_buffer.write(method.c_str(), method.size());
20539
20540			try {
20541	0	0	if (method == "morphodita_parsito") {
20542	0	0	if (!trainer_morphodita_parsito::train(training, heldout, tokenizer, tagger, parser, os_buffer, error))
		0
20543			return false;
20544			} else {
20545	0	0	error.assign("Unknown UDPipe method '").append(method).append("'!");
		0
20546			return false;
20547		0	}
		0
20548			} catch (training_error& e) {
20549			error.assign(e.what());
20550			return false;
20551			}
20552
20553	0	0	os << os_buffer.rdbuf();
20554			return true;
20555			}
20556
20557	0		unsigned trainer::hyperparameter_integer(unsigned run, unsigned index, unsigned minimum, unsigned maximum) {
20558	0		return minimum + int((maximum - minimum + 1) * rnd(run, index));
20559			}
20560
20561	0		double trainer::hyperparameter_uniform(unsigned run, unsigned index, double minimum, double maximum) {
20562	0		return minimum + (maximum - minimum) * rnd(run, index);
20563			}
20564
20565	0		double trainer::hyperparameter_logarithmic(unsigned run, unsigned index, double minimum, double maximum) {
20566	0		return exp(log(minimum) + (log(maximum) - log(minimum)) * rnd(run, index));
20567			}
20568
20569	0		double trainer::rnd(unsigned run, unsigned index) {
20570			uint32_t state = 12345U;
20571	0	0	for (unsigned i = 0; i < 10; i++)
		0
		0
		0
		0
		0
		0
		0
		0
		0
20572	0		state = state * 1103515245U + run * 19999999U + index * 1000000007U + 12345U;
20573	0		return (state >> 16) / double(1<<16);
20574			}
20575
20576			/////////
20577			// File: morphodita/tagger/elementary_features_encoder.h
20578			/////////
20579
20580			// This file is part of MorphoDiTa .
20581			//
20582			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
20583			// Mathematics and Physics, Charles University in Prague, Czech Republic.
20584			//
20585			// This Source Code Form is subject to the terms of the Mozilla Public
20586			// License, v. 2.0. If a copy of the MPL was not distributed with this
20587			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
20588
20589			namespace morphodita {
20590
20591			template
20592	0		inline bool elementary_features::save(ostream& os) {
20593	0		binary_encoder enc;
20594
20595	0	0	enc.add_1B(maps.size());
20596	0	0	for (auto&& map : maps)
20597	0	0	map.save(enc);
20598
20599	0	0	return compressor::save(os, enc);
20600			}
20601
20602			} // namespace morphodita
20603
20604			/////////
20605			// File: morphodita/tagger/feature_sequences_encoder.h
20606			/////////
20607
20608			// This file is part of MorphoDiTa .
20609			//
20610			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
20611			// Mathematics and Physics, Charles University in Prague, Czech Republic.
20612			//
20613			// This Source Code Form is subject to the terms of the Mozilla Public
20614			// License, v. 2.0. If a copy of the MPL was not distributed with this
20615			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
20616
20617			namespace morphodita {
20618
20619			template
20620	0		void feature_sequences::parse(int window_size, istream& is) {
20621			unordered_map elementary_map;
20622	0	0	for (auto&& description : ElementaryFeatures::descriptions)
20623	0	0	if (!elementary_map.emplace(description.name, description).second)
20624	0	0	training_failure("Repeated elementary feature with name " << description.name << '!');
20625
20626			string line;
20627	0		vector tokens;
20628	0	0	while (getline(is, line)) {
		0
20629	0	0	split(line, ',', tokens);
20630	0	0	if (tokens.empty()) training_failure("Feature sequence cannot be empty!");
		0
		0
20631
20632			bool contains_only_current = false;
20633	0	0	sequences.emplace_back();
20634	0	0	for (auto&& token : tokens) {
20635	0		vector parts;
20636	0	0	split(token, ' ', parts);
20637	0	0	if (parts.size() != 2) training_failure("Cannot parse feature sequence element '" << token << "'!");
		0
		0
20638			auto it = elementary_map.find(parts[0]);
20639	0	0	if (it == elementary_map.end()) training_failure("Unknown elementary feature '" << parts[0] << "' used in feature sequence '" << token << "'!");
		0
		0
20640
20641			auto& desc = it->second;
20642	0	0	int sequence_index = parse_int(parts[1].c_str(), "sequence_index");
20643	0	0	if (desc.type == DYNAMIC && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of dynamic elementary feature '" << desc.name << "'!");
		0
		0
		0
		0
20644	0	0	if (desc.type == PER_TAG && (sequence_index > 0 \|\| sequence_index <= -window_size)) training_failure("Wrong sequence index " << sequence_index << " of per-tag elementary feature '" << desc.name << "'!");
		0
		0
		0
		0
		0
20645	0	0	if (desc.range == ONLY_CURRENT && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of elementary feature '" << desc.name << "' requiring zero offset!");
		0
		0
		0
		0
20646
20647	0	0	sequences.back().elements.emplace_back(it->second.type, it->second.index, sequence_index);
20648	0	0	if (desc.type == DYNAMIC) sequences.back().dependant_range = max(sequences.back().dependant_range, window_size + 1);
20649	0	0	if (desc.type == PER_TAG) sequences.back().dependant_range = max(sequences.back().dependant_range, 1 - sequence_index);
20650	0		contains_only_current \|= desc.range == ONLY_CURRENT;
20651			}
20652	0	0	if (contains_only_current && sequences.back().dependant_range > 1) training_failure("Feature sequence '" << line << "' contains both a non-local elementary feature and exclusively-local elementary feature!");
		0
		0
		0
		0
20653			}
20654
20655	0		stable_sort(sequences.begin(), sequences.end(), [](const feature_sequence& a, const feature_sequence& b) { return a.dependant_range > b.dependant_range; });
20656	0	0	scores.resize(sequences.size());
20657	0		}
20658
20659			template
20660	0		inline bool feature_sequences::save(ostream& os) {
20661	0	0	if (!elementary.save(os)) return false;
20662
20663	0		binary_encoder enc;
20664	0	0	enc.add_1B(sequences.size());
20665	0	0	for (auto&& sequence : sequences) {
20666	0		enc.add_4B(sequence.dependant_range);
20667	0	0	enc.add_1B(sequence.elements.size());
20668	0	0	for (auto&& element : sequence.elements) {
20669	0		enc.add_4B(element.type);
20670	0		enc.add_4B(element.elementary_index);
20671	0		enc.add_4B(element.sequence_index);
20672			}
20673			}
20674
20675	0	0	enc.add_1B(scores.size());
20676	0	0	for (auto&& score : scores)
20677	0	0	score.save(enc);
20678
20679	0	0	return compressor::save(os, enc);
20680			}
20681
20682			} // namespace morphodita
20683
20684			/////////
20685			// File: morphodita/tagger/training_maps.h
20686			/////////
20687
20688			// This file is part of MorphoDiTa .
20689			//
20690			// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of
20691			// Mathematics and Physics, Charles University in Prague, Czech Republic.
20692			//
20693			// This Source Code Form is subject to the terms of the Mozilla Public
20694			// License, v. 2.0. If a copy of the MPL was not distributed with this
20695			// file, You can obtain one at http://mozilla.org/MPL/2.0/.
20696
20697			namespace morphodita {
20698
20699			// Declarations
20700	0	0	class training_elementary_feature_map {
		0
20701			public:
20702			inline elementary_feature_value value(const char* feature, int len) const;
20703			mutable unordered_map map = {{"", elementary_feature_empty}};
20704			private:
20705			mutable string key;
20706			};
20707
20708	0		class training_feature_sequence_map {
20709			public:
20710			struct info {
20711			// We deliberately use feature_sequences_score to check for overflow
20712			feature_sequences_score alpha = 0;
20713			feature_sequences_score gamma = 0;
20714			int last_gamma_update = 0;
20715			};
20716
20717			inline feature_sequence_score score(const char* feature, int len) const;
20718			mutable unordered_map map;
20719			private:
20720			mutable string key;
20721			};
20722
20723			template