File Coverage

blib/lib/LaTeX/TOM.pm

Criterion	Covered	Total	%
statement	11	11	100.0
branch			n/a
condition			n/a
subroutine	4	4	100.0
pod	1	1	100.0
total	16	16	100.0

line	stmt	sub	pod	time	code
1					###############################################################################
2					#
3					# LaTeX::TOM (TeX Object Model)
4					#
5					# Version 1.05
6					#
7					# ----------------------------------------------------------------------------
8					#
9					# originally written by Aaron Krowne (akrowne@vt.edu)
10					# July 2002
11					#
12					# Virginia Polytechnic Institute and State University
13					# Department of Computer Science
14					# Digital Libraries Research Laboratory
15					#
16					# now maintained by Steven Schubiger (schubiger@cpan.org)
17					# April 2008
18					#
19					# ----------------------------------------------------------------------------
20					#
21					# This module provides some decent semantic handling of LaTeX documents. It is
22					# inspired by XML::DOM, so users of that module should be able to acclimate
23					# themselves to this one quickly. Basically the subroutines in this package
24					# allow you to parse a LaTeX document into its logical structure, including
25					# groupings, commands, environments, and comments. These all go into a tree
26					# which is built as arrays of Perl hashes.
27					#
28					###############################################################################
29
30					package LaTeX::TOM;
31
32	10	10		85050	use strict;
	10			73
	10			307
33	10	10		54	use base qw(LaTeX::TOM::Parser);
	10			16
	10			5988
34	10	10		79	use constant true => 1;
	10			23
	10			4880
35
36					our $VERSION = '1.05';
37
38					our (%INNERCMDS, %MATHENVS, %MATHBRACKETS,
39					%BRACELESS, %TEXTENVS, $PARSE_ERRORS_FATAL,
40					$DEBUG);
41
42					# BEGIN CONFIG SECTION ########################################################
43
44					# these are commands that can be "embedded" within a grouping to alter the
45					# environment of that grouping. For instance {\bf text}. Without listing the
46					# command names here, the parser will treat such sequences as plain text.
47					#
48					%INNERCMDS = map { $_ => true } (
49					'bf',
50					'md',
51					'em',
52					'up',
53					'sl',
54					'sc',
55					'sf',
56					'rm',
57					'it',
58					'tt',
59					'noindent',
60					'mathtt',
61					'mathbf',
62					'tiny',
63					'scriptsize',
64					'footnotesize',
65					'small',
66					'normalsize',
67					'large',
68					'Large',
69					'LARGE',
70					'huge',
71					'Huge',
72					'HUGE',
73					);
74
75					# these commands put their environments into math mode
76					#
77					%MATHENVS = map { $_ => true } (
78					'align',
79					'equation',
80					'eqnarray',
81					'displaymath',
82					'ensuremath',
83					'math',
84					'$$',
85					'$',
86					'\[',
87					'\(',
88					);
89
90					# these commands/environments put their children in text (non-math) mode
91					#
92					%TEXTENVS = map { $_ => true } (
93					'tiny',
94					'scriptsize',
95					'footnotesize',
96					'small',
97					'normalsize',
98					'large',
99					'Large',
100					'LARGE',
101					'huge',
102					'Huge',
103					'HUGE',
104					'text',
105					'textbf',
106					'textmd',
107					'textsc',
108					'textsf',
109					'textrm',
110					'textsl',
111					'textup',
112					'texttt',
113					'mbox',
114					'fbox',
115					'section',
116					'subsection',
117					'subsubsection',
118					'em',
119					'bf',
120					'emph',
121					'it',
122					'enumerate',
123					'description',
124					'itemize',
125					'trivlist',
126					'list',
127					'proof',
128					'theorem',
129					'lemma',
130					'thm',
131					'prop',
132					'lem',
133					'table',
134					'tabular',
135					'tabbing',
136					'caption',
137					'footnote',
138					'center',
139					'flushright',
140					'document',
141					'article',
142					'titlepage',
143					'title',
144					'author',
145					'titlerunninghead',
146					'authorrunninghead',
147					'affil',
148					'email',
149					'abstract',
150					'thanks',
151					'algorithm',
152					'nonumalgorithm',
153					'references',
154					'thebibliography',
155					'bibitem',
156					'verbatim',
157					'verbatimtab',
158					'quotation',
159					'quote',
160					);
161
162					# these form sets of simple mode delimiters
163					#
164					%MATHBRACKETS = (
165					'$$' => '$$',
166					'$' => '$',
167					# '\[' => '\]', # these are problematic and handled separately now
168					# '$' => '$',
169					);
170
171					# these commands require no braces, and their parameters are simply the
172					# "word" following the command declaration
173					#
174					%BRACELESS = map { $_ => true } (
175					'oddsidemargin',
176					'evensidemargin',
177					'topmargin',
178					'headheight',
179					'headsep',
180					'textwidth',
181					'textheight',
182					'input',
183					);
184
185					# default value controlling how fatal parse errors are
186					#
187					# 0 = warn, 1 = die, 2 = silent
188					#
189					$PARSE_ERRORS_FATAL = 0;
190
191					# debugging mode (internal use)
192					#
193					# 0 = off, 1 = messages, 2 = messages and code
194					#
195					$DEBUG = 0;
196
197					# END CONFIG SECTION ##########################################################
198
199					sub new {
200	9	9	1	1161	my $class = shift;
201
202	9			71	return LaTeX::TOM::Parser->_new(@_);
203					}
204
205					1;
206
207					=head1 NAME
208
209					LaTeX::TOM - A module for parsing, analyzing, and manipulating LaTeX documents.
210
211					=head1 SYNOPSIS
212
213					use LaTeX::TOM;
214
215					$parser = LaTeX::TOM->new;
216
217					$document = $parser->parseFile('mypaper.tex');
218
219					$latex = $document->toLaTeX;
220
221					$specialnodes = $document->getNodesByCondition(sub {
222					my $node = shift;
223					return (
224					$node->getNodeType eq 'TEXT'
225					&& $node->getNodeText =~ /magic string/
226					);
227					});
228
229					$sections = $document->getNodesByCondition(sub {
230					my $node = shift;
231					return (
232					$node->getNodeType eq 'COMMAND'
233					&& $node->getCommandName =~ /section$/
234					);
235					});
236
237					$indexme = $document->getIndexableText;
238
239					$document->print;
240
241					=head1 DESCRIPTION
242
243					This module provides a parser which parses and interprets (though not fully)
244					LaTeX documents and returns a tree-based representation of what it finds.
245					This tree is a C<LaTeX::TOM::Tree>. The tree contains C<LaTeX::TOM::Node> nodes.
246
247					This module should be especially useful to anyone who wants to do processing
248					of LaTeX documents that requires extraction of plain-text information, or
249					altering of the plain-text components (or alternatively, the math-text
250					components).
251
252					=head1 COMPONENTS
253
254					=head2 LaTeX::TOM::Parser
255
256					The parser recognizes 3 parameters upon creation by C<< LaTeX::TOM->new >>.
257					The parameters, in order, are
258
259					=over 4
260
261					=item parse error handling (= B<0> || 1 || 2)
262
263					Determines what happens when a parse error is encountered. C<0> results in a
264					warning. C<1> results in a die. C<2> results in silence. Note that particular
265					groupings in LaTeX (i.e. newcommands and the like) contain invalid TeX or
266					LaTeX, so you nearly always need this parameter to be C<0> or C<2> to completely
267					parse the document.
268
269					=item read inputs flag (= 0 || B<1>)
270
271					This flag determines whether a scan for C<\input> and C<\input-like> commands is
272					performed, and the resulting called files parsed and added to the parent
273					parse tree. C<0> means no, C<1> means do it. Note that this will happen recursively
274					if it is turned on. Also, bibliographies (F<.bbl> files) are detected and
275					included.
276
277					=item apply mappings flag (= 0 || B<1>)
278
279					This flag determines whether (most) user-defined mappings are applied. This
280					means C<\defs>, C<\newcommands>, and C<\newenvironments>. This is critical for
281					properly analyzing the content of the document, as this must be phrased in terms
282					of the semantics of the original TeX and LaTeX commands, not ad hoc user macros.
283					So, for instance, do not expect plain-text extraction to work properly with this
284					option off.
285
286					=back
287
288					The parser returns a C<LaTeX::TOM::Tree> ($document in the SYNOPSIS).
289
290					=head2 LaTeX::TOM::Node
291
292					Nodes may be of the following types:
293
294					=over 4
295
296					=item TEXT
297
298					C<TEXT> nodes can be thought of as representing the plain-text portions of the
299					LaTeX document. This includes math and anything else that is not a recognized
300					TeX or LaTeX command, or user-defined command. In reality, C<TEXT> nodes contain
301					commands that this parser does not yet recognize the semantics of.
302
303					=item COMMAND
304
305					A C<COMMAND> node represents a TeX command. It always has child nodes in a tree,
306					though the tree might be empty if the command operates on zero parameters. An
307					example of a command is
308
309					\textbf{blah}
310
311					This would parse into a C<COMMAND> node for C<textbf>, which would have a subtree
312					containing the C<TEXT> node with text ``blah.''
313
314					=item ENVIRONMENT
315
316					Similarly, TeX environments parse into C<ENVIRONMENT> nodes, which have metadata
317					about the environment, along with a subtree representing what is contained in
318					the environment. For example,
319
320					\begin{equation}
321					r = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}
322					\end{equation}
323
324					Would parse into an C<ENVIRONMENT> node of the class ``equation'' with a child
325					tree containing the result of parsing C<``r = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}.''>
326
327					=item GROUP
328
329					A C<GROUP> is like an anonymous C<COMMAND>. Since you can put whatever you want in
330					curly-braces (C<{}>) in TeX in order to make semantically isolated regions, this
331					separation is preserved by the parser. A C<GROUP> is just the subtree of the
332					parsed contents of plain curly-braces.
333
334					It is important to note that currently only the first C<GROUP> in a series of
335					C<GROUP>s following a LaTeX command will actually be parsed into a C<COMMAND> node.
336					The reason is that, for the initial purposes of this module, it was not
337					necessary to recognize additional C<GROUP>s as additional parameters to the
338					C<COMMAND>. However, this is something that this module really should do
339					eventually. Currently if you want all the parameters to a multi-parametered
340					command, you'll need to pick out all the following C<GROUP> nodes yourself.
341
342					Eventually this will become something like a list which is stored in the
343					C<COMMAND> node, much like L<XML::DOM>'s treatment of attributes. These are, in a
344					sense, apart from the rest of the document tree. Then C<GROUP> nodes will become
345					much more rare.
346
347					=item COMMENT
348
349					A C<COMMENT> node is very similar to a C<TEXT> node, except it is specifically for
350					lines beginning with C<``%''> (the TeX comment delimiter) or the right-hand
351					portion of a line that has C<``%''> at some internal point.
352
353					=back
354
355					=head2 LaTeX::TOM::Trees
356
357					As mentioned before, the Tree is the return result of a parse.
358
359					The tree is nothing more than an arrayref of Nodes, some of which may contain
360					their own trees. This is useful knowledge at this point, since the user isn't
361					provided with a full suite of convenient tree-modification methods. However,
362					Trees do already have some very convenient methods, described in the next
363					section.
364
365					=head1 METHODS
366
367					=head2 LaTeX::TOM
368
369					=head3 new
370
371					=over 4
372
373					=item C<>
374
375					Instantiate a new parser object.
376
377					=back
378
379					In this section all of the methods for each of the components are listed and
380					described.
381
382					=head2 LaTeX::TOM::Parser
383
384					The methods for the parser are:
385
386					=head3 parseFile (filename)
387
388					=over 4
389
390					=item C<>
391
392					Read in the contents of I<filename> and parse them, returning a C<LaTeX::TOM::Tree>.
393
394					=back
395
396					=head3 parse (string)
397
398					=over 4
399
400					=item C<>
401
402					Parse the string I<string> and return a C<LaTeX::TOM::Tree>.
403
404					=back
405
406					=head2 LaTeX::TOM::Tree
407
408					This section contains methods for the Trees returned by the parser.
409
410					=head3 copy
411
412					=over 4
413
414					=item C<>
415
416					Duplicate a tree into new memory.
417
418					=back
419
420					=head3 print
421
422					=over 4
423
424					=item C<>
425
426					A debug print of the structure of the tree.
427
428					=back
429
430					=head3 plainText
431
432					=over 4
433
434					=item C<>
435
436					Returns an arrayref which is a list of strings representing the text of all
437					C C nodes, in an inorder traversal.
438
439					=back
440
441					=head3 indexableText
442
443					=over 4
444
445					=item C<>
446
447					A method like the above but which goes one step further; it cleans all of the
448					returned text and concatenates it into a single string which one could consider
449					having all of the standard information retrieval value for the document,
450					making it useful for indexing.
451
452					=back
453
454					=head3 toLaTeX
455
456					=over 4
457
458					=item C<>
459
460					Return a string representing the LaTeX encoded by the tree. This is especially
461					useful to get a normal document again, after modifying nodes of the tree.
462
463					=back
464
465					=head3 getTopLevelNodes
466
467					=over 4
468
469					=item C<>
470
471					Return a list of C at the top level of the Tree.
472
473					=back
474
475					=head3 getAllNodes
476
477					=over 4
478
479					=item C<>
480
481					Return an arrayref with B<all> nodes of the tree. This "flattens" the tree.
482
483					=back
484
485					=head3 getCommandNodesByName (name)
486
487					=over 4
488
489					=item C<>
490
491					Return an arrayref with all C<COMMAND> nodes in the tree which have a name
492					matching I<name>.
493
494					=back
495
496					=head3 getEnvironmentsByName (name)
497
498					=over 4
499
500					=item C<>
501
502					Return an arrayref with all C<ENVIRONMENT> nodes in the tree which have a class
503					matching I<name>.
504
505					=back
506
507					=head3 getNodesByCondition (code reference)
508
509					=over 4
510
511					=item C<>
512
513					This is a catch-all search method which can be used to pull out nodes that
514					match pretty much any perl expression, without manually having to traverse the
515					tree. I<code reference> is a perl code reference which receives as its first
516					argument the node of the tree that is currently scrutinized and is expected to
517					return a boolean value. See the SYNOPSIS for examples.
518
519					=back
520
521					=head3 getFirstNode
522
523					=over 4
524
525					=item C<>
526
527					Returns the first node of the tree. This is useful if you want to walk the tree
528					yourself, starting with the first node.
529
530					=back
531
532					=head2 LaTeX::TOM::Node
533
534					This section contains the methods for nodes of the parsed Trees.
535
536					=head3 getNodeType
537
538					=over 4
539
540					=item C<>
541
542					Returns the type, one of C<TEXT>, C<COMMAND>, C<ENVIRONMENT>, C<GROUP>, or C<COMMENT>,
543					as described above.
544
545					=back
546
547					=head3 getNodeText
548
549					=over 4
550
551					=item C<>
552
553					Applicable for C<TEXT> or C<COMMENT> nodes; this returns the document text they contain.
554					This is undef for other node types.
555
556					=back
557
558					=head3 setNodeText
559
560					=over 4
561
562					=item C<>
563
564					Set the node text, also for C<TEXT> and C<COMMENT> nodes.
565
566					=back
567
568					=head3 getNodeStartingPosition
569
570					=over 4
571
572					=item C<>
573
574					Get the starting character position in the document of this node. For C<TEXT>
575					and C<COMMENT> nodes, this will be where the text begins. For C<ENVIRONMENT>,
576					C<COMMAND>, or C<GROUP> nodes, this will be the position of the I<last> character of
577					the opening identifier.
578
579					=back
580
581					=head3 getNodeEndingPosition
582
583					=over 4
584
585					=item C<>
586
587					Same as above, but for last character. For C<GROUP>, C<ENVIRONMENT>, or C<COMMAND>
588					nodes, this will be the I<first> character of the closing identifier.
589
590					=back
591
592					=head3 getNodeOuterStartingPosition
593
594					=over 4
595
596					=item C<>
597
598					Same as getNodeStartingPosition, but for C<GROUP>, C<ENVIRONMENT>, or C<COMMAND> nodes,
599					this returns the I<first> character of the opening identifier.
600
601					=back
602
603					=head3 getNodeOuterEndingPosition
604
605					=over 4
606
607					=item C<>
608
609					Same as getNodeEndingPosition, but for C<GROUP>, C<ENVIRONMENT>, or C<COMMAND> nodes,
610					this returns the I<last> character of the closing identifier.
611
612					=back
613
614					=head3 getNodeMathFlag
615
616					=over 4
617
618					=item C<>
619
620					This applies to any node type. It is C<1> if the node sets, or is contained
621					within, a math mode region. C<0> otherwise. C<TEXT> nodes which have this flag as C<1>
622					can be assumed to be the actual mathematics contained in the document.
623
624					=back
625
626					=head3 getNodePlainTextFlag
627
628					=over 4
629
630					=item C<>
631
632					This applies only to C<TEXT> nodes. It is C<1> if the node is non-math B<and> is
633					visible (in other words, will end up being a part of the output document). One
634					would only want to index C<TEXT> nodes with this property, for information
635					retrieval purposes.
636
637					=back
638
639					=head3 getEnvironmentClass
640
641					=over 4
642
643					=item C<>
644
645					This applies only to C<ENVIRONMENT> nodes. Returns what class of environment the
646					node represents (the C<X> in C<\begin{X}> and C<\end{X}>).
647
648					=back
649
650					=head3 getCommandName
651
652					=over 4
653
654					=item C<>
655
656					This applies only to C<COMMAND> nodes. Returns the name of the command (the C<X> in
657					C<\X{...}>).
658
659					=back
660
661					=head3 getChildTree
662
663					=over 4
664
665					=item C<>
666
667					This applies only to C<COMMAND>, C<ENVIRONMENT>, and C<GROUP> nodes: it returns the
668					C<LaTeX::TOM::Tree> which is ``under'' the calling node.
669
670					=back
671
672					=head3 getFirstChild
673
674					=over 4
675
676					=item C<>
677
678					This applies only to C<COMMAND>, C<ENVIRONMENT>, and C<GROUP> nodes: it returns the
679					first node from the first level of the child subtree.
680
681					=back
682
683					=head3 getLastChild
684
685					=over 4
686
687					=item C<>
688
689					Same as above, but for the last node of the first level.
690
691					=back
692
693					=head3 getPreviousSibling
694
695					=over 4
696
697					=item C<>
698
699					Return the prior node on the same level of the tree.
700
701					=back
702
703					=head3 getNextSibling
704
705					=over 4
706
707					=item C<>
708
709					Same as above, but for following node.
710
711					=back
712
713					=head3 getParent
714
715					=over 4
716
717					=item C<>
718
719					Get the parent node of this node in the tree.
720
721					=back
722
723					=head3 getNextGroupNode
724
725					=over 4
726
727					=item C<>
728
729					This is an interesting function, and kind of a hack because of the way the
730					parser makes the current tree. Basically it will give you the next sibling
731					that is a C node, until it either hits the end of the tree level, a C
732					node which doesn't match C, or a C node.
733
734					This is useful for finding all C<GROUP>ed parameters after a C<COMMAND> node (see
735					comments for C<GROUP> in the C<COMPONENTS> / C<LaTeX::TOM::Node> section). You
736					can just have a while loop that calls this method until it gets C<undef>, and
737					you'll know you've found all the parameters to a command.
738
739					Note: this may be bad, but C Nodes matching C (optional
740					parameter groups) are treated as if they were 'blank'.
741
742					=back
743
744					=head1 CAVEATS
745
746					Due to the lack of tree-modification methods, currently this module is
747					mostly useful for minor modifications to the parsed document, for instance,
748					altering the text of C<TEXT> nodes but not deleting the nodes. Of course, the
749					user can still do this by breaking abstraction and directly modifying the Tree.
750
751					Also note that the parsing is not complete. This module was not written with
752					the intention of being able to produce output documents the way ``latex'' does.
753					The intent was instead to be able to analyze and modify the document on a
754					logical level with regards to the content; it doesn't care about the document
755					formatting and outputting side of TeX/LaTeX.
756
757					There is much work still to be done. See the F list in the F source.
758
759					=head1 BUGS
760
761					Probably plenty. However, this module has performed fairly well on a set of
762					~1000 research publications from the Computing Research Repository, so I
763					deemed it ``good enough'' to use for purposes similar to mine.
764
765					Please let the maintainer know of parser errors if you discover any.
766
767					=head1 CREDITS
768
769					Thanks to the following people (in order of appearance), who have contributed valuable suggestions and patches:
770
771					Otakar Smrz
772					Moritz Lenz
773					James Bowlin
774					Jesse S. Bangs
775					Cord Merrell
776					Debian Perl Group
777					Eli Billauer
778
779					=head1 AUTHORS
780
781					Written by Aaron Krowne
782
783					Maintained by Steven Schubiger
784
785					=head1 LICENSE
786
787					This program is free software; you may redistribute it and/or
788					modify it under the same terms as Perl itself.
789
790					See L
791
792					=cut