File Coverage

blib/lib/GCC/TranslationUnit.pm
Criterion Covered Total %
statement 9 77 11.6
branch 0 40 0.0
condition n/a
subroutine 3 7 42.8
pod n/a
total 12 124 9.6


line stmt bran cond sub pod time code
1             package GCC::TranslationUnit;
2 1     1   43055 use strict;
  1         2  
  1         49  
3              
4             our $VERSION = "1.00";
5              
6 1     1   675 use GCC::Tree;
  1         3  
  1         297  
7              
8             package GCC::TranslationUnit::Parser;
9              
10             # This class parses the GCC translation unit, as dumped by the
11             # -fdump-translation-unit flag in gcc-3.2.2, and sticks it into a Perl
12             # datastructure intended to somewhat mirror the tree structure
13             # documented in gcc/doc/c-tree.texi, from that same version of gcc.
14              
15             # C++ overloaded operator names, as per cp/dump.c in gcc-3.2.2
16             our %ops = (
17             'new' => "new",
18             vecnew => "new[]",
19             'delete' => "delete",
20             vecdelete => "delete[]",
21             'pos' => "+",
22             neg => "-",
23             addr => "&",
24             deref => "*",
25             'not' => "~",
26             lnot => "!",
27             preinc => "++",
28             predec => "--",
29             plus => "+",
30             plusassign => "+=",
31             minus => "-",
32             minusassign => "-=",
33             mult => "*",
34             multassign => "*=",
35             div => "/",
36             divassign => "/=",
37             mod => "%",
38             modassign => "%=",
39             'and' => "&",
40             andassign => "&=",
41             'or' => "|",
42             orassign => "|=",
43             'xor' => "^",
44             xorassign => "^=",
45             lshift => "<<",
46             lshiftassign => "<<=",
47             rshift => ">>",
48             rshiftassign => ">>=",
49             'eq' => "==",
50             'ne' => "!=",
51             'lt' => "<",
52             'gt' => ">",
53             'le' => "<=",
54             'ge' => ">=",
55             land => "&&",
56             lor => "||",
57             compound => ",",
58             memref => "->*",
59             'ref' => "->",
60             subs => "[]",
61             postinc => "++",
62             postdec => "--",
63             call => "()",
64             assign => "="
65             );
66              
67             # op names for use in regex
68             my $opnames = join '|', keys(%ops);
69              
70             # I'm using the standard -fdump-translation-unit format. Anyone is welcome
71             # to provide an XML parser for the various XML format patches to GCC.
72              
73 1     1   1750 use IO::File;
  1         16745  
  1         7724  
74              
75             # My initial parser was regarded as 'unreadable' and 'uncommented' by
76             # some people. That was unfortunate. Here, have a comment:
77              
78             # Usage: $tu = GCC::TranslationUnit::Parser->parsefile("file.c.tu")
79             #
80             # Better yet, save yourself some memory and do:
81             # $node = GCC::TranslationUnit::Parser->parsefile("file.c.tu")->root;
82             #
83             # Remember, $tu is an N-element array, where N is the number of nodes GCC
84             # dumped. Only keep the array if you plan to iterate through every element
85             # in the GCC tree; otherwise, free the memory used by the array, and let
86             # the Perl reference counter free the node branches if you delete them.
87             sub parsefile {
88 0     0     my $class = shift;
89 0           my $file = shift;
90              
91 0           my $fh = new IO::File $file;
92 0 0         return undef unless defined $fh; # Not my problem if I can't open your file
93              
94 0           my $self = bless [], $class;
95 0           my $dump = ""; # the text of a single dumped node
96 0           my $index = 0; # numerical index of the "current" node
97              
98 0           my $line;
99 0           while($line = $fh->getline) {
100 0 0         if($line =~ /^\@(\d+)/) {
101             # The first line of a node should look like:
102             # "@123 some_node ..."
103             # Every other line is requried to be indented
104             #
105             # When we find that "first" line, or EOF, parse the previous node
106 0           $self->parsenode($dump, $self->node($index));
107 0           $self->node($index)->{position} = $index;
108 0           $dump = $line;
109 0           $index = $1;
110             } else {
111 0           $dump .= $line;
112             }
113             }
114 0           $self->parsenode($dump, $self->node($index));
115              
116 0           return $self;
117             }
118              
119             # Returns the root node, from GCC's perspective.
120             # Usage: $tu->root
121 0     0     sub root { shift->[1] }
122              
123             # Take the complete dumped text in $dump from a single GCC node and
124             # stuff it into $node.
125             #
126             # Usage: $tu->parsenode($dump, $tu->node($index));
127             sub parsenode {
128 0     0     my($self, $dump, $node) = @_;
129 0 0         return unless $dump;
130             # print "".("-" x 70) . "\n";
131             # print $dump;
132              
133             # Note that this regex leaves a space before the first key.
134             #
135             # That's because the key matching rule is /\s\w.{3}:\s/
136 0 0         unless($dump =~ s/^\@(\d+)\s+(\w+)(?=\s)//) {
137 0           warn "Unknown node format:\n$dump";
138 0           return;
139             }
140              
141 0           my $index = $1;
142 0           my $type = $2;
143             # print "tu[$1] = $type\n";
144              
145 0           bless $node, "GCC::Node::$type";
146 0           $node->{INDEX} = $index;
147              
148             # First, some examples as to what's possible as a node element
149             #
150             # dump_index() writes:
151             # node: @123
152             #
153             # dump_pointer() writes:
154             # node: 2468ace0
155             #
156             # dump_int() writes:
157             # node: 1234567
158             #
159             # dump_string() writes:
160             # string
161             #
162             # dump_string_field() writes:
163             # node: string
164              
165              
166             # The string nodes can seriously disrupt us, since they aren't quoted.
167             # They are VERY inconvenient, so we must eliminate them first.
168             #
169             # Anything that looks like an identifier_node string must be squashed ASAP
170 0 0         if($dump =~ s/\s+strg:\s(.*)\slngt:\s(\d+)//s) {
171             # identifier_node and string_cst come here, at least
172 0           my($string, $length) = ($1, $2);
173            
174             # string_cst's lngt includes the NUL character, which fprintf()
175             # doesn't print, obviously. Make sure to factor that in...
176 0 0         $length-- if $type eq 'string_cst';
177              
178 0           $node->{'string'} = substr($string, 0, $length);
179 0           $node->{'length'} = $length;
180             # print "string: '$node->{string}'\n";
181             }
182              
183             # The srcp key is BAD. It contains a colon as part of the value, which
184             # could ruin the key parser regex. It's gotta go. Not to mention filenames
185             # with spaces...
186             #
187             # Example:
188             # srcp: file.c:123
189 0 0         $node->{'source'} = $1 if $dump =~ s/\ssrcp:\s(.*?:\d+)(?=\s)//;
190             # print "source: '$node->{source}'\n" if $node->{source};
191            
192             # The remaining nodes are pretty regular and easy to parse.
193             # However, the flags which crop up everywhere disrupt my ability
194             # to determine the end of a value corresponding to a key. for instance:
195             #
196             # key : value protected
197             #
198             # Depending on what the key represents, the protected flag may or
199             # may not be a part of the value. In order to remove the ambiguity,
200             # we have to manually parse out any keys whose values aren't captured
201             # with a trivial /\s(\w.{3}):\s(\S)\s/ match.
202              
203             # First violator:
204             # "qual: %c%c%c "
205 0 0         if($dump =~ s/\squal:\s(.{3})\s//) {
206 0           my $qual = $1;
207 0 0         $node->{'const'} = 1 if $qual =~ /c/;
208 0 0         $node->{'volatile'} = 1 if $qual =~ /v/;
209 0 0         $node->{'restrict'} = 1 if $qual =~ /r/;
210             }
211              
212             # next violator:
213             # base: @1234 virtual public
214             #
215 0           while($dump =~ s/\sbase:\s\@(\d+)\s+(.*?)\s*(public|private|protected)//) {
216             # base is the only key which can appear multiple times in the same
217             # node, since it's spit out from an array. We need to put it back.
218             #
219             # The other vector nodes have a conveniently unique number, like 'op 0'
220 0           my $classid = $1;
221 0 0         my $virtual = $2 ? 1 : 0; # yes, yes, !!$2. vim don't like it, though
222 0           my $access = $3;
223 0           my $base = {
224             class => $self->node($classid),
225             virtual => $virtual,
226             access => $access
227             };
228             # print "base: $virtual $access $classid\n";
229 0           push @{ $node->{'base'} }, $base;
  0            
230             }
231              
232             # Some GCC developer forgot to read the -fdump spec before dipping his
233             # fingers into gcc/cp/dump.c.
234             #
235             # Remember: 4 character max per key!
236 0 0         $node->{raises} = $self->node($1) if $dump =~ s/\sraises: \@(\d+)//;
237            
238             # At this point, we assume all the remaining key/value pairs match
239             # the following regex.
240 0           while($dump =~ s/\s(\w.{0,3}?)\s*:\s(\S+)//) {
241 0           my($key, $value) = ($1, $2);
242 0 0         $value = $self->node($1) if $value =~ /^\@(\d+)/;
243             # print "'$key': '$value'\n";
244              
245             # If the key looks like it came out of a tree_vec or operand list,
246             # stick it back into an array, to save us the trouble of doing
247             # bounds-checking hash fetching voodoo.
248             #
249             # Although, those tree_vec nodes skip elements when indexed in decimal.
250 0 0         if($key =~ /^\d+$/) {
    0          
251 0           $node->{vector}[$key] = $value;
252             } elsif($key =~ /^op (\d+)$/) {
253 0           $node->{operand}[$1] = $value;
254             } else {
255 0           $node->{$key} = $value;
256             }
257             }
258              
259             # Before we can consider the rest of the data as a sequence of flags,
260             # I need to remove a few special-case flags which can be thought of as
261             # being "intentionally sequential" for whatever reason.
262            
263             # operator is fun. The next flag is GCC's operator "name", for which there
264             # is a mapping to the C operator, above, in the declaration of %ops.
265 0 0         if($dump =~ s/\soperator\s+($opnames)\b//o) {
266 0           $node->{operator} = $1;
267             # print "operator $ops{$node->{operator}}\n";
268             }
269              
270             # Some flags have spaces in them! Parse it as one string.
271 0           while($dump =~ s/\s(global init|global fini|pseudo tmpl)(?=\s)//) {
272             # Honestly, I'd rather s/foo bar/foo_bar/g instead
273             # print "TRUE $1\n";
274 0           $node->{$1} = 1;
275             }
276            
277             # For sanity's sake, lets save the access
278 0 0         $node->{access} = $1 if $dump =~ /\b(public|private|protected)\b/;
279              
280             # All that should remain is flags
281 0           while($dump =~ s/(\w+)//) {
282             # print "TRUE $1\n";
283 0           $node->{$1} = 1;
284             }
285              
286             # For debugging purposes, check for extra characters.
287             # If this warning occurs, it's a bug in the parser. Let me know about it.
288             # First, make sure you didn't run out of diskspace when writing the
289             # -fdump file. That would truncate the file and cause this warning, and
290             # that's not my fault.
291             #
292             # Please include details with any error reports: the version of GCC you
293             # used, your operating system (both kernel and distribution), and the
294             # source of the file which generated the error, if possible.
295 0 0         if($dump =~ /\S/) {
296 0           $dump =~ s/\s+/ /g;
297 0           warn "Unparsed data: $dump\nFrom: $_[1] ";
298             }
299              
300 0           return $node;
301             }
302              
303             # Since the dump format includes forward references, we need to pre-initialize
304             # nodes we haven't parsed yet before we assign them to the nodes which
305             # reference them. Therefore, you must ALWAYS get a node using this function
306             # until the parsefile() routine returns a complete translation unit!
307             #
308             # Usage: $tu->node($id)
309             sub node {
310 0     0     my($self, $id) = @_;
311 0 0         unless($self->[$id]) {
312             # Don't un-block this unless; I like adding debug stuff here
313 0           $self->[$id] = {};
314             }
315 0           return $self->[$id];
316             }
317              
318             # vim:set shiftwidth=4 softtabstop=4:
319             1;
320             __END__