root/trunk/gadfly-1.0.0/gadfly/kjParser.py @ 7

Revision 7, 49.9 kB (checked in by jhorman, 8 years ago)

moving "trunk" into "trunk" directory

Line 
1"""Python for parser interpretation
2
3:Author: Aaron Watters
4:Maintainers: http://gadfly.sf.net/
5:Copyright: Aaron Robert Watters, 1994
6:Id: $Id: kjParser.py,v 1.1 2005/06/05 05:51:05 jhorman Exp $:
7"""
8
9# BUGS:
10# Lexical error handling is not nice
11# Parse error handling is not nice
12#
13# Lex analysis may be slow for big grammars
14# Setting case sensitivity for keywords MUST happen BEFORE
15#   declaration of keywords.
16
17import kjSet
18import string
19import re
20import string
21
22# set this flag for regression testing at each load
23RUNTESTS = 0
24
25# set this flag to enable warning for default reductions
26WARNONDEFAULTS = 0
27
28# some local constants
29TERMFLAG = -1 # FLAG FOR TERMINAL
30NOMATCHFLAG = -2 # FLAG FOR NO MATCH IN FSM
31MOVETOFLAG = -3 # FLAG FOR "SHIFT" IN SN FSM
32REDUCEFLAG = -4 # FLAG FOR REDUCTION IN FSM
33TRANSFLAG = -5 # FLAG FOR TRANSIENT STATE IN FSM
34KEYFLAG = -6 # FLAG FOR KEYWORD
35NONTERMFLAG = -7 # FLAG FOR NONTERMINAL
36TERMFLAG = -8 # FLAG FOR TERMINAL
37EOFFLAG = "*" # FLAG for End of file
38
39# set this string to the Module name (filename)
40# used for dumping reconstructable objects
41THISMODULE = "gadfly.kjParser"
42
43# regular expression for matching whitespace
44WHITERE = "["+string.whitespace+"]+"
45WHITEREGEX = re.compile(WHITERE)
46
47# local errors
48class LexTokenError(Exception):
49    '''may happen on bad string'''
50class UnkTermError(Exception):
51    ''' ditto '''
52class BadPunctError(Exception):
53    ''' if try to make whitespace a punct '''
54class ParseInitError(Exception):
55    ''' shouldn't happen? '''
56class FlowError(Exception):
57    ''' shouldn't happen!!! (bug) '''
58class ReductError(Exception):
59    ''' shouldn't happen? '''
60class NondetError(Exception):
61    ''' shouldn't happen? '''
62
63# the end of file is interpreted in the lexical stream as
64# a terminal...
65#  this should be appended to the lexical stream:
66ENDOFFILETOKEN = (TERMFLAG, EOFFLAG)
67
68# in FSM use the following terminal to indicate eof
69ENDOFFILETERM = (ENDOFFILETOKEN, EOFFLAG)
70
71# Utility function for match conversion from regex to re
72def RMATCH(re, key, start=0):
73    #print "RMATCH: %s -> %s <- start=%s" % (re.pattern, key, start)
74    group = re.match(key, start)
75    if group is None:
76        #print "RMATCH: -1"
77        return -1
78    len = group.end() - group.start()
79    #print "RMATCH: %s (%s)" % (len, group.group())
80    return len
81
82# utility function for error diagnostics
83def DumpStringWindow(Str, Pos, Offset=15):
84    L = []
85    L.append("near ::")
86    start = Pos-Offset
87    end = Pos+Offset
88    if start<0: start = 0
89    if end>len(Str): end = len(Str)
90    L.append(`Str[start:Pos]`+"*"+`Str[Pos:end]`)
91    from string import join
92    return join(L, "\n")
93class LexDictionary:
94    '''Lexical dictionary class
95      this data structure is used by lexical parser below.
96
97      basic operations:
98         LD.punctuation(string)
99            registers a string as a punctuation
100              EG: LD.punctuation(":")
101         Punctuations are treated as a special kind of keyword
102         that is recognized even when not surrounded by whitespace.
103         IE, "xend" will not be recognized as "x end", but "x;" will be
104             recognized as "x ;" if "end" is a regular keyword but
105             ";" is a punctuation.  Only single character punctuations
106             are supported (now), ie, ":=" must be recognized as
107             ":" "=" above the lexical level.
108
109         LD.comment(compiled_reg_expression)
110            registers a comment pattern
111              EG LD.comment(regex.compile("--.*\n"))
112                asks to recognize ansi/sql comments like "-- correct?\n"
113
114         LD[compiled_reg_expression] = (TerminalFlag, Function) # assignment!
115            specifies a regular expression that should be associated
116            with the lexical terminal marker TerminalFlag
117              EG: LD[regex.compile("[0-9]+")] = ("integer",string.atoi)
118            the Function should be a function on one string argument
119            that interprets the matching string as a value. if None is
120            given, just the string itself will be used as the
121            interpretation.  (a better choice above would be a function
122            which "tries" atoi first and uses atol on overflow).
123         NOTE: ambiguity among regular expressions will be decided
124            arbitrarily (fix?).
125
126         LD[string] # retrieval!
127            returns ((KEYFLAG, Keywordstring), Keywordstring)
128             if the (entire) string matches a keyword or a
129             punctuation Keywordstring.
130            otherwise returns ((TERMFLAG, Terminalname), value)
131             if the (entire) string matches the regular expression for
132             a terminal flaged by Terminalname; value is the interpreted
133             value.  TerminalFlag better be something other than
134             KEYFLAG!
135            otherwise raises an error!
136            comments not filtered here!
137
138    the following additional functions are used for autodocumentation
139    in declaring rules, etcetera.
140        begin = LD.keyword("begin")
141           sets variable "begin" to (KEYFLAG, "BEGIN") if
142           "begin" maps to keyword "BEGIN" in LD
143        integer = LD.terminal("integer")
144           sets variable integer to ("integer", Function)
145           if  "integer" is a registered terminal Function is
146           its associated interpretation function.
147    '''
148
149    def __init__(self):
150        # commentpatterns is simply a list of compiled regular expressions
151        # that represent comments
152        self.commentpatterns = []
153        # commentstrings is used for debugging/dumping/reconstruction etc.
154        self.commentstrings = []
155        # punctuationlist is a string of punctuations
156        self.punctuationlist = ""
157        # keywordmap is a dictionary mapping recognized keyword strings
158        # and punctuations to their constant representations.
159        self.keywordmap = KeywordDict()
160        # regexprlist is a list of triples (regex,Flag,function) mapping
161        # regular expressions to their flag and interpreter function.
162        self.regexprlist = []
163
164    def Dump(self):
165        print "comments = ", self.commentstrings
166        print "punctuations = ", self.punctuationlist
167        print "keywordmap ="
168        self.keywordmap.Dump()
169        print "regexprlist =", self.regexprlist
170
171    def __getitem__(self, key):
172        # try to match string to a keyword
173        if self.keywordmap.has_key(key):
174            return self.keywordmap[key]
175
176        # try to match a regular expression
177        length = len(key)
178        for regexpr, flag, function in self.regexprlist:
179            index = RMATCH(regexpr, key)
180            if index == length:
181                break
182        else:
183            raise LexTokenError, "no match for string: " + `key`
184
185        # use the function to interpret the string, if given
186        if function != None:
187            value = function(key)
188        else:
189            value = key
190        return (flag, value)
191
192    def keyword(self,str):
193        ''' LD.keyword("this") will make a new keyword "this" if not found
194        '''
195        # upcase the string, if needed
196        if self.keywordmap.caseInsensitive:
197            str = string.upper(str)
198        if not self.keywordmap.has_key(str):
199            # redundancy for to avoid excess construction during parsing
200            token = (KEYFLAG,str)
201            self.keywordmap[str] = (token,str)
202        else:
203            (token, str2) = self.keywordmap[str]
204        return token
205
206    def terminal(self, string, RegExpr=None, Function=None):
207        ''' LD.terminal("this") will just look for "this"
208            LD.terminal("this", RE, F) will register a new terminal
209            RE must be a compiled regular expression or string reg ex
210            F must be an interpretation function
211        '''
212        if RegExpr != None and Function != None:
213            if type(RegExpr) == type(""):
214                RegExpr = re.compile(RegExpr)
215            self[ RegExpr ] = ( string, Function)
216        for regexpr, token, function in self.regexprlist:
217            if token[1] == string:
218                break
219        else:
220            raise UnkTermError, "no such terminal"
221        return token
222
223    def __setitem__(self,key,value):
224        if type(key) == type(''):
225            # if it's a string it must be a keyword
226            if self.keywordmap.caseInsensitive:
227                value = string.upper(value)
228                key = string.upper(key)
229            self.keywordmap[key] = ( (KEYFLAG, value), value)
230        else:
231            # otherwise it better be a compiled regular expression (not
232            #verified)
233            (Name, Function) = value
234            Flag = (TERMFLAG, Name)
235            regexpr = key
236            self.regexprlist.append((regexpr, Flag, Function))
237
238    def comment(self, string):
239        ''' register a regular expression as a comment
240        '''
241        # regexpr better be a uncompiled string regular expression!
242        # (not verified)
243        regexpr = re.compile(string)
244        self.commentpatterns = self.commentpatterns + [ regexpr ]
245        self.commentstrings = self.commentstrings + [ string ]
246
247    def punctuation(self,Instring):
248        ''' register a string as a punctuation
249        '''
250        if type(Instring) != type("") or len(Instring)!=1:
251            raise BadPunctError, "punctuation must be string of length 1"
252        if Instring in string.whitespace:
253            raise BadPunctError, "punctuation may not be whitespace"
254        self.punctuationlist = self.punctuationlist + Instring
255        return self.keyword(Instring)
256
257    def isCaseSensitive(self):
258        ''' testing and altering case sensitivity behavior
259        '''
260        return not self.keywordmap.caseInsensitive
261
262    def SetCaseSensitivity(self, Boolean):
263        ''' setting case sensitivity MUST happen before keyword
264            declarations!
265        '''
266        self.keywordmap.caseInsensitive = not Boolean
267
268    def Token(self, String, StartPosition):
269        ''' function to do same as __getitem__ above but looking _inside_ a
270            string instead of at the whole string
271
272            returns (token,skip) where token is one of
273             ((KEYFLAG,name),name) or ((TERMFLAG,termname),value)
274            and skip is the length of substring of string that matches thetoken
275        '''
276        finished = 0 # dummy, exit should be nonlocal
277        totalOffset = 0
278        while not finished:
279            # flag EOF if past end of string?
280            if len(String) <= StartPosition:
281                return (ENDOFFILETERM, 0)
282            # skip whitespace
283            whitespacefound = 0
284            skip = RMATCH(WHITEREGEX,String, StartPosition)
285            if skip > 0:
286                StartPosition = StartPosition + skip
287                totalOffset = totalOffset + skip
288                whitespacefound = 1
289            # try to find comment, keyword, term in that order:
290            # looking for comment
291            commentfound = 0
292            for commentexpr in self.commentpatterns:
293                offset = RMATCH(commentexpr,String,StartPosition)
294                if offset != -1:
295                    if offset<1:
296                        info = DumpStringWindow(String,StartPosition)
297                        raise LexTokenError, "zero length comment "+info
298                    commentfound = 1
299                    StartPosition = StartPosition + offset
300                    totalOffset = totalOffset + offset
301            # looking for a keyword
302            keypair = self.keywordmap.hasPrefix(String,StartPosition,
303                          self.punctuationlist)
304            if keypair != 0:
305                return ( keypair[0], keypair[1] + totalOffset)
306            # looking for terminal
307            for (regexpr, Flag, Function) in self.regexprlist:
308                offset = RMATCH(regexpr,String,StartPosition)
309                if offset != -1:
310                    matchstring = String[StartPosition : offset+StartPosition]
311                    if Function != None:
312                        value = Function(matchstring)
313                    else:
314                        value = matchstring
315                    return ((Flag, value) , offset + totalOffset)
316            if not (commentfound or whitespacefound):
317                info = DumpStringWindow(String,StartPosition)
318                raise LexTokenError, "Lexical parse failure "+info
319
320# alternate, experimental implementation
321
322class lexdictionary:
323
324    def __init__(self):
325        self.skip = ""
326        self.commentstrings = []
327        self.punctuationlist = ""
328        self.keywordmap = KeywordDict()
329        self.termlist = [] # list of (term, regex, flag, interpret_fn)
330        self.uncompiled = 1 # only compile after full initialization.
331        self.laststring= self.lastindex= self.lastresult = None
332
333    def Dump(self, *k):
334        raise "sorry", "not implemented"
335    __getitem__ = Dump
336
337    def keyword(self, str):
338        kwm = self.keywordmap
339        if kwm.caseInsensitive:
340            str = string.upper(str)
341        try:
342            (token, str2) = kwm[str]
343        except:
344            token = (KEYFLAG, str)
345            self.keywordmap[str] = (token,str)
346        return token
347
348    def terminal(self, str, regexstr=None, Function=None):
349        if regexstr is not None:
350            flag = (TERMFLAG, str)
351            self.termlist.append( (str, regexstr, flag, Function) )
352            return flag
353        else:
354            for (s,fl,fn) in self.termlist:
355                if fl[1]==str:
356                    return fl
357                else:
358                    raise UnkTermError, "no such terminal"
359
360    __setitem__ = Dump
361
362    def comment(self, str):
363        self.commentstrings.append(str)
364
365    def punctuation(self, Instring):
366        if type(Instring) != type("") or len(Instring)!=1:
367            raise BadPunctError, "punctuation must be string of length 1"
368        if Instring in string.whitespace:
369            raise BadPunctError, "punctuation may not be whitespace"
370        self.punctuationlist = self.punctuationlist + Instring
371        return self.keyword(Instring)
372
373    def SetCaseSensitivity(self, Boolean):
374        self.keywordmap.caseInsensitive = not Boolean
375
376    def Token(self, String, StartPosition):
377        # shortcut for reductions.
378        if self.laststring is String and self.lastindex == StartPosition:
379            #print "lastresult", self.lastresult
380            return self.lastresult
381        self.lastindex = StartPosition
382        self.laststring = String
383        #print `String[StartPosition: StartPosition+60]`
384
385        if self.uncompiled:
386            self.compile()
387            self.uncompiled = None
388        finished = 0
389        totalOffset = 0
390        skipprog = self.skipprog
391        keypairfn = self.keywordmap.hasPrefix
392        punctlist = self.punctuationlist
393        termregex = self.termregex
394        while not finished:
395            if len(String) <= StartPosition:
396                result = self.lastresult = (ENDOFFILETERM, 0)
397                return result
398            # skip ws and comments
399            #skip = skipprog.match(String, StartPosition)
400            skip = RMATCH(skipprog, String, StartPosition)
401            if skip>0:
402                if skip==0:
403                    info = DumpStringWindow(String, StartPosition)
404                    raise LexTokenError, \
405                        "zero length whitespace or comment "+info
406                StartPosition = StartPosition + skip
407                totalOffset = totalOffset + skip
408                continue
409            # look for keyword
410            keypair = keypairfn(String, StartPosition, punctlist)
411            if keypair!=0:
412                #print "keyword", keypair
413                result = self.lastresult = (keypair[0], keypair[1]+totalOffset)
414                return result
415            # look for terminal
416            #print "Termregex: %s --> %s <-- start=%s" % (termregex.pattern, String, StartPosition)
417            offset = termregex.match(String, StartPosition)
418            if offset is not None:
419                g = offset.group
420                for (term, regex, flag, fn) in self.termlist:
421                    test = g(term)
422                    if test:
423                        #print "terminal", test
424                        if fn is not None:
425                            value = fn(test)
426                        else:
427                            value = test
428                        result = self.lastresult = (
429                            (flag, value), offset.end() - offset.start() + totalOffset)
430                        return result
431            # error if we get here
432            info = DumpStringWindow(String, StartPosition)
433            raise LexTokenError, "Lexical token not found "+info
434
435    def isCaseSensitive(self):
436        return not self.keywordmap.caseInsensitive
437
438    def compile(self):
439        from string import joinfields, whitespace
440        import re
441        skipregexen = self.commentstrings + [WHITERE]
442        skipregex = "(" + joinfields(skipregexen, ")|(") + ")"
443        #print skipregex; import sys; sys.exit(1)
444        self.skipprog = re.compile(skipregex)
445        termregexen = []
446        termnames = []
447        for (term, rgex, flag, fn) in self.termlist:
448            fragment = "(?P<%s>%s)" % (term, rgex)
449            termregexen.append(fragment)
450            termnames.append(term)
451        termregex = joinfields(termregexen, "|")
452        self.termregex = re.compile(termregex)
453        self.termnames = termnames
454
455LexDictionary = lexdictionary ##### test!
456#XXX
457# a utility class: dictionary of prefixes
458#  should be generalized to allow upcasing of keyword matches
459class KeywordDict:
460
461    def __init__(self, caseInsensitive = 0):
462        self.FirstcharDict = {}
463        self.KeyDict = {}
464        self.caseInsensitive = caseInsensitive
465
466    def Dump(self):
467        if self.caseInsensitive:
468            print "  case insensitive"
469        else:
470            print "  case sensitive"
471        keys = self.KeyDict.keys()
472        print "  keyDict has ", len(keys), " elts"
473        for key in keys:
474            print "     ", key," maps to ",self.KeyDict[key]
475        firstchars = self.FirstcharDict.keys()
476        print "  firstcharDict has ", len(firstchars), " elts"
477        for char in firstchars:
478            print "     ", char," maps to ",self.FirstcharDict[char]
479
480    # set item assumes value has correct case already, if case sensitive
481    def __setitem__(self, key, value):
482        if len(key)<1:
483            raise LexTokenError, "Keyword of length 0"
484        if self.caseInsensitive:
485            KEY = string.upper(key)
486        else:
487            KEY = key
488        firstchar = KEY[0:1]
489        if self.FirstcharDict.has_key(firstchar):
490            self.FirstcharDict[firstchar] = \
491                self.FirstcharDict[firstchar] + [(KEY, value)]
492        else:
493            self.FirstcharDict[firstchar] = [(KEY, value)]
494        self.KeyDict[KEY] = value
495
496    # if String has a registered keyword at start position
497    #  return its canonical representation and offset, else 0
498    # keywords that are not punctuations should be
499    # recognized only if followed
500    # by a punctuation or whitespace char
501    #
502    def hasPrefix(self,String,StartPosition,punctuationlist):
503        First = String[StartPosition:StartPosition+1]
504        fcd = self.FirstcharDict
505        caseins = self.caseInsensitive
506        if caseins:
507            First = string.upper(First)
508        if fcd.has_key(First):
509            Keylist = fcd[First]
510        else:
511            return 0
512        for (key,value) in Keylist:
513            offset = len(key)
514            EndPosition = StartPosition+offset
515            match = String[StartPosition : EndPosition]
516            if caseins:
517                match = string.upper(match)
518            if key == match:
519                if len(key)==1 and key in punctuationlist:
520                    # punctuations are recognized regardless of nextchar
521                    return (value,offset)
522                else:
523                    # nonpuncts must have punct or whitespace following
524                    #(uses punct as single char convention)
525                    if EndPosition == len(String):
526                        return (value, offset)
527                    else:
528                        nextchar = String[EndPosition]
529                        if nextchar in string.whitespace\
530                            or nextchar in punctuationlist:
531                            return (value, offset)
532        return 0 # if no exit inside for loop, fail
533
534    def __getitem__(self,key):
535        if self.caseInsensitive:
536            key = string.upper(key)
537        return self.KeyDict[key]
538
539    def has_key(self,key):
540        if self.caseInsensitive:
541            key = string.upper(key)
542        return self.KeyDict.has_key(key)
543
544# LexStringWalker walks through a string looking for
545# substrings recognized by a lexical dictionary
546#
547#  ERROR REPORTING NEEDS IMPROVEMENT
548class LexStringWalker:
549
550    def __init__(self, String, LexDict):
551        self.Position = 0
552        self.NextPosition = 0
553        self.String = String
554        self.LexDict = LexDict
555        self.PastEOF = 0
556        self.Done = 0
557
558    def DUMP(self):
559        return DumpStringWindow(self.String,self.Position)
560
561    #reset not defined
562
563    def more(self):
564        return not self.PastEOF
565
566    def getmember(self):
567        (Token,skip) = self.LexDict.Token(self.String, self.Position)
568        self.NextPosition = self.Position + skip
569        if Token == ENDOFFILETERM:
570            self.PastEOF = 1
571        return Token
572
573    def next(self):
574        if self.Done:
575            data = self.DUMP()
576            raise LexTokenError, "no next past end of file "+data
577        elif self.PastEOF:
578            self.Done=1
579        elif self.NextPosition > self.Position:
580            self.Position = self.NextPosition
581        else:
582            dummy = self.getmember()
583            if self.NextPosition <= self.Position:
584                data = self.DUMP()
585                raise LexTokenError, "Lexical walker not advancing "+data
586            self.Position = self.NextPosition
587
588class ParserObj:
589    ''' the parse class:
590          Based loosely on Aho+Ullman, Principles of Compiler Design, Ch.6.
591           except that they don't describe how to handle boundary
592           conditions, I made them up myself.
593
594          Note: This could be implemented using just functions; it's implemented
595           as a class to facilitate diagnostics and debugging in case of
596           failures of various sorts.
597
598        a parse accepts
599          a rule list
600
601          a lexically analysed stream with methods
602            stream.getmember()  returns the current token on the stream
603            stream.next()  moves on to next token
604            stream.more()     returns false if current token is the last token
605
606          and a FSM (finite state machine) with methods
607            FSM.root_nonTerminal
608              the nonterminal at which to start parsing
609            FSM.initial_state
610              the initial state to start at
611            FSM.successful_final_state
612              the final state to go to upon successful parse
613            FSM.map(Current_State,Current_Token)
614              returns either
615                 (TERMFLAG, 0)
616                    if Current_State is terminal (final or reduction).
617                 (NOMATCHFLAG, 0)
618                    if Current_State is nonterminal, but the Current_Token
619                    and Next_Token do not lead to a valid state in the FSM
620                 (MOVETOFLAG, Next_State)
621                    if Current_State is nonterminal and Current_Token,
622                    Next_token map to Next_State from Current_State.
623                 (REDUCEFLAG, Rulenum)
624                    if Current_State indicates a reduction at Current_Token
625                    for rule Rule number Rule
626
627           and a Stack with methods (replaced with dictionary)
628                 (init: {-1:0} )
629              Stack.Top() returns top of stack (no pop)
630                 ( Stack[Stack[-1]] )
631              Stack.Push(Object)
632                 ( Stack[-1]=Stack[-1]+1; Stack[Stack[-1]]=Object )
633              Stack.MakeEmpty()
634                 ( Stack[-1]=0 )
635              Stack.IsEmpty()
636                 ( Stack[-1] == 0 )
637              Stack.Pop()
638                 ( Stack[-1] = Stack[-1]-1 )
639              stack contents created by Parser will be of form (State,Value)
640              where Value was inserted at FSM state State.
641              Value of form either (KEYFLAG, Name)
642                                   (NontermName, reductionvalue)
643                                or (TerminalName, value)
644
645           and an optional parameter Evaluate which if 0 indicates that
646              rules should be evaluated, otherwise indicates that rules
647              should just be reduced and the reduction structure should
648              be used as the result of the rule
649
650        rule objects must support methods
651           Rule.reduce(Stack)
652              pops off the elements corresponding to the body of the Rule
653              from the stack and returns (NewStack,Red) where NewStack is
654              the stack minus the body and Red is the result of evaluating the
655              reduction function on this instance of the rule.
656           Rule.Nonterm
657              the nonterminal at the head of the rule
658    '''
659
660    # Evaluate determines whether rules should be evaluated
661    # after reductions.  Context is an argument passed to the
662    # list reduction function
663    #
664    def __init__(self, Rulelist, Stream, FSM, Stack, Evaluate=1, Context=None):
665        self.Rules = Rulelist
666        self.LexStream = Stream
667        self.FSM = FSM
668        self.Stack = Stack
669        self.Context = Context
670
671        # start with empty stack, initial_state, no nonterminal
672        #self.Stack[-1] = 0#   self.Stack.MakeEmpty()
673        self.Stack[:] = []
674        self.State = FSM.initial_state
675        self.currentNonterm = None
676        self.Evaluate = Evaluate
677
678    def DoOneReduction(self):
679        ''' DoOneReduction accepts tokens from the stream and pushes
680            them onto the stack until a reduction state is reached.
681
682            Resolve the reduction
683        '''
684        current=self.State
685        FSM=self.FSM
686        Stack = self.Stack
687        Context = self.Context
688        Stream = self.LexStream
689        # the internal FSM.StateTokenMap dictionary is used directly here.
690        STMap = FSM.StateTokenMap
691        #if FSM.final_state(current):
692        #   raise ParseInitError, 'trying to reduce starting at final state'
693
694        tokenVal = Stream.getmember()
695        #print "tokenVal", tokenVal
696        token = tokenVal[0]
697
698        # push the token and traverse FSM until terminal state is reached
699        #(flag, nextThing) = FSM.map(current, token)
700        key = (current, token)
701        try:
702            (flag, nextThing) = STMap[key][0]
703        except KeyError:
704            flag = NOMATCHFLAG
705
706        while flag == MOVETOFLAG:
707            nextState = nextThing
708            #print current, " shift ", token,
709            # no sanity check, possible infinite loop
710
711            # push current token and next state
712            ThingToPush = (nextState, tokenVal)
713            #print "pushing ", ThingToPush
714            #Stack[-1]=Stack[-1]+1; Stack[Stack[-1]]=ThingToPush
715            Stack.append(ThingToPush)
716            #Stack.Push( ThingToPush )
717
718            # move to next token, next state
719            Stream.next()
720            # error if end of stream
721            if not Stream.more(): # optimized Stream.PastEOF (?)
722                data = Stream.DUMP()
723                raise EOFError, 'end of stream during parse '+data
724
725            current = nextState
726            tokenVal = Stream.getmember()
727            token = tokenVal[0]
728
729            #MAP = FSM.map(current,token)
730            key = (current, token)
731            try:
732                (flag, nextThing) = STMap[key][0]
733            except KeyError:
734                flag = NOMATCHFLAG
735
736        # at end of while loop we should be at a reduction state
737
738        if flag == REDUCEFLAG:
739            rulenum = nextThing
740            #print current, " reduce ", token, self.Rules[rulenum]
741            # normal case
742            # perform reduction
743            rule = self.Rules[rulenum]
744            Nonterm = rule.Nonterm
745            self.currentNonterm = Nonterm
746            (Stack, reduct) = rule.reduce( Stack , Context )
747            GotoState = self.GotoState(rule)
748            # push the Gotostate and result of rule reduction on stack
749            ThingToPush = (GotoState, (Nonterm, reduct) )
750            # push the result of the reduction and exit normally
751            #print "pushing ", ThingToPush
752            #Stack[-1]=Stack[-1]+1; Stack[Stack[-1]]=ThingToPush
753            Stack.append(ThingToPush)
754            #Stack.Push(ThingToPush)
755            self.State=GotoState
756            return 1  # normal successful completion
757
758        # some error cases
759        elif flag == NOMATCHFLAG:
760            self.ParseError(current,tokenVal, "nomatch1")
761        else:
762            data = Stream.DUMP()
763            s = """
764               flag = %s
765               map = %s """ % (flag, FSM.map(current,token))
766            data = data + s
767            raise FlowError, 'unexpected else '+data
768    def GotoState(self, rule):
769        ''' compute the state to goto after a reduction is performed on a rule.
770            Algorithm: determine the state at beginning of reduction
771             and the next state indicated by the head nonterminal of the rule.
772             special case: empty stack and root nonterminal > success.
773        '''
774        FSM = self.FSM
775        Stack = self.Stack
776        Head = rule.Nonterm
777        if len(Stack)==0: #Stack[-1]==0: #Stack.IsEmpty():
778            BeforeState = FSM.initial_state
779        else:
780            BeforeState = Stack[-1][0] #Stack[Stack[-1]][0] #Stack.Top()[0]
781         # is this right? if the stack is empty and the Head
782         # is the root nonterm, then goto is final state
783        if len(Stack)==0 and Head == FSM.root_nonTerminal:#Stack.isEmpty()
784            Result = FSM.successful_final_state
785        else:
786            # consider eliminating the call to .map here? (efficiency)
787            (flag, Result) = FSM.map(BeforeState, Head)
788            if flag != MOVETOFLAG:
789                #FSM.DUMP()
790                self.ParseError(BeforeState, Head, "notmoveto")
791        return Result
792
793    def ParseError( self, State, Token, *rest):
794        # make this parse error nicer (add diagnostic methods?)
795        L = [""]
796        L.append("*******************************")
797        L.append("current state = "+`State`)
798        L.append("expects: ")
799        expects = ""
800        for (flag,name) in self.FSM.Expects(State):
801            if flag in (TERMFLAG, KEYFLAG):
802                expects = expects + `name`+ ", "
803        L.append(expects)
804        L.append(`rest`)
805        L.append("current token = " + `Token`)
806        #print "Stack =",
807        #self.StackDump(5)
808        #print
809        from string import join
810        data = self.LexStream.DUMP() + join(L, "\n")
811        raise SyntaxError, 'unexpected token sequence.' + data
812
813    def StackDump(self, N):
814        Stack = self.Stack
815        Topkey = len(Stack)
816        if Topkey>N:
817            Start = Topkey - N
818        else:
819            Start = 1
820        for i in range(Start,Topkey+1):
821            print " :: ", Stack[i],
822
823    def GO(self):
824        '''execute parsing until done
825        '''
826        while self.State != self.FSM.successful_final_state:
827            self.DoOneReduction()
828        # should I check that stack has only one elt here?
829        # return result of last reduction
830        return self.Stack[-1][1] #self.Stack.Top()[1]
831
832def nonterminal(string):
833    ''' function for declaring a variable to represent a nonterminal:
834         eg Program = nonterminal("program")
835          included for convenient autodocumentation
836    '''
837    return (NONTERMFLAG, string)
838
839def termrep(string):
840    ''' declaring a terminal WITHOUT INSTALLING IT IN A LexDict
841    '''
842    return (TERMFLAG, string)
843
844def DefaultReductFun( RuleResultsList, Context ):
845    ''' used as a default reduction function for rules
846    '''
847    if WARNONDEFAULTS:
848        print "warn: default reduction."
849        print "   ", RuleResultsList
850    return RuleResultsList
851
852class ParseRule:
853    ''' the rule class
854          a rule is defined by a goal nonterminal marker of form
855            (NONTERMFLAG, Name)
856          and a list defining the body which must contain elts of form
857            (KEYFLAG, Name) or (NONTERMFLAG, Name) of (TERMFLAG, Name)
858          and a reduction function which takes a list of the same size
859          as the BodyList (consisting of the results of the evaluations of
860          the previous reductions)
861          and returns an interpretation for the body
862    '''
863    def __init__(self, goalNonTerm, BodyList, \
864         ReductFunction = DefaultReductFun):
865        #print BodyList
866        # check some of the arguments (very limited!)
867        if len(goalNonTerm) != 2 or goalNonTerm[0] != NONTERMFLAG:
868            raise TypeError, "goal of rule must be nonterminal"
869        for m in BodyList:
870            #print m
871            if len(m) != 2:
872                raise TypeError, "invalid body form for rule"
873        self.Nonterm = goalNonTerm
874        self.Body = BodyList
875        self.ReductFun = ReductFunction
876
877    def __repr__(self):
878        return THISMODULE + ".ParseRule" + `self.components()`
879
880    def components(self):
881        ''' marshal-able components of a rule
882        '''
883        return (self.Nonterm, self.Body)
884
885    def reduce(self, Stack, Context=None):
886        ''' rule.reduce(Stack) pops of the stack elements corresponding
887            to the body of the rule and prepares the appropriate reduction
888            object for evaluation (or not) at higher levels
889        '''
890        #print "reducing", Stack
891        Blength = len(self.Body)
892        #print Blength, len(self.Body)
893        # pop off previous results from stack corresponding to body
894        BodyResults = [None] * Blength
895        #BodyNames = [None] * Blength # for debug
896        #print "popping: "
897        for i in range(1,Blength+1):
898            Bindex = Blength - i  # stack contents pop off in reverse order
899
900            # get and destructure the rule body entry
901            RuleEntry = self.Body[Bindex]
902            ( REkind , REname ) = RuleEntry
903
904            # get and destructure the stack entry
905            PoppedValue = Stack[-i] #Stack.Top()
906            #print PoppedValue,
907            #del Stack[-1]# = Stack[-1]-1 #Stack.Pop()
908            SETokVal = PoppedValue[1]
909            SEvalue = SETokVal[1]
910            SEname = SETokVal[0][1]
911
912            # the names from rule and stack must match (?)
913            if SEname != REname:
914                print SEname, REname
915                print self
916                raise ReductError, " token names don't match"
917
918            # store the values for the reduction
919            BodyResults[Bindex] = SEvalue
920            #BodyNames[Bindex] = SEname # debug
921
922        del Stack[len(Stack)-Blength:]
923        #print "reduced", Stack
924        #print
925        # evaluate the reduction, in context
926        reduct = self.ReductFun(BodyResults, Context)
927        if WARNONDEFAULTS and self.ReductFun is DefaultReductFun:
928            # should check whether name is defined before this...
929            print "  default used on ", self.Name
930        #Reduction( self.ReductFun, BodyResults, BodyNames )
931        return (Stack, reduct)
932
933
934def PrintDefaultBindings(rulelist):
935    ''' for debugging: look through a rule list and print names of rules
936        that have default binding
937    '''
938    for r in rulelist:
939        if r.ReductFun is DefaultReductFun:
940            print r.Name
941
942class FSMachine:
943    def __init__(self, rootNonTerm):
944        # start and success state conventions
945        startState=1
946        successState=0
947
948        self.root_nonTerminal = rootNonTerm
949        self.initial_state = startState
950        self.successful_final_state = successState
951
952        # the list of states of the FSM, implemented as a dictionary
953        #  entries are identified by their index
954        #  content is
955        #  a list whose first elt is either TRANSFLAG, or TERMFLAG
956        #  other list elts may be added by other layers (parse generator)
957        #  indicating the kind of the state.
958        self.States = {}
959
960        # allocate start and success states
961        self.States[startState]=[TRANSFLAG]
962        self.States[successState]=[TERMFLAG]
963
964        # the most recently allocated state
965        self.maxState= startState
966
967        # the map of current token+state number to next state
968        #with entries of form (tokenname,state):nextstate_sequence
969        #
970        self.StateTokenMap = {}
971
972    def DUMP(self, DumpMapData=1, DumpStateData=1, ForbiddenMark={}):
973        ''' ForbiddenMark is for filtering out maps to an error state
974        '''
975        print "root nonterminal is ", self.root_nonTerminal
976        print "start at ", self.initial_state
977        print "end at ", self.successful_final_state
978        print "number of states: ", self.maxState
979        if DumpStateData:
980            print
981            for State in range(0,self.maxState+1):
982                Data = self.States[State]
983                print State, ": ", Data
984        if DumpMapData:
985            print
986            for key in self.StateTokenMap.keys():
987                map = self.StateTokenMap[key]
988                if map[0][0] == MOVETOFLAG:
989                    ToStateData = self.States[map[0][1]]
990                    if len(ToStateData) < 2:
991                        Mark = None
992                    else:
993                        Mark = ToStateData[1]
994                    if Mark != ForbiddenMark:
995                        print key, " > ", map, " = ", ToStateData
996                else:
997                    print key, " > reduction to rule number ", map[0][1]
998
999    def Expects(self, State):
1000        ''' what tokens does a state expect?
1001        '''
1002        keys = self.StateTokenMap.keys()
1003        Tokens = kjSet.NewSet( [] )
1004        for (state1,token) in keys:
1005            if State == state1:
1006                kjSet.addMember(token,Tokens)
1007        return kjSet.get_elts(Tokens)
1008
1009    def NewState(self, kind, AdditionalInfo = []):
1010        ''' "allocate" a new state of specified kind
1011              kind must either be TRANSFLAG, TERMFLAG or a rule object
1012            returns the number of the new state
1013        '''
1014        if not kind in (TRANSFLAG,TERMFLAG,REDUCEFLAG):
1015            raise TypeError, "unknown state kind"
1016        available = self.maxState+1
1017
1018        self.States[available] = [kind] + AdditionalInfo
1019        self.maxState = available
1020        return available
1021
1022    def SetReduction(self, fromState, TokenRep, Rulenum):
1023        ''' Install a reduction transition in the FSM:
1024            a reduction is represented by mapping to a rule index
1025            no nondeterminism is allowed.
1026        '''
1027        key = (fromState, TokenRep)
1028        if not self.StateTokenMap.has_key(key):
1029            self.StateTokenMap[ key ] = ((REDUCEFLAG, Rulenum),)
1030        else:
1031            raise ReductError, "attempt to set ambiguous reduction"
1032
1033    def SetMap(self, fromState, TokenRep, toState):
1034        ''' Install a "shift" or "goto transition in the FSM:
1035            supports nondeterminism by storing a sequence of possible
1036            transitions
1037        '''
1038        key = (fromState, TokenRep)
1039        if self.StateTokenMap.has_key(key):
1040            Old = self.StateTokenMap[key]
1041            if Old[0][0] != MOVETOFLAG:
1042                # if the old value was not an integer, not a "normal state":
1043                # complain:
1044                raise NondetError, \
1045                    "attempt to make inappropriate transition ambiguous"
1046            self.StateTokenMap[key] = Old + ((MOVETOFLAG,toState),)
1047        else:
1048            self.StateTokenMap[key] = ((MOVETOFLAG,toState),)
1049
1050    def map(self, current_state, current_token):
1051        ''' Find the action indicated by fsm on
1052             (current_state, current_token) input.
1053
1054            note: in the event of nondeterministic choice this chooses
1055              the first possibility listed.
1056            ParseObj.DoOneReduction() currently uses the internal structure
1057             of StateTokenMap directly, rather than using this function.
1058        '''
1059        StateEntry = self.States[current_state][0]
1060        if StateEntry == TERMFLAG:
1061            return (TERMFLAG, 0)
1062        elif StateEntry == TRANSFLAG:
1063            # try to find a transition for this token and state
1064            key = (current_state, current_token)
1065            try:
1066                TMap = self.StateTokenMap[key]
1067                return TMap[0]
1068            except KeyError:
1069                return (NOMATCHFLAG, 0)
1070        else:
1071            raise FlowError, "unexpected else (2)"
1072
1073class Grammar:
1074    ''' the grammar class:
1075          a grammar consists of
1076            - a LexDict lexical dictionary;
1077            - a deterministic FSMachine;
1078            - a Rulelist
1079          and optionally a dictionary that maps Rulenames
1080          to Rulelist indices (used for dumping and externally)
1081    '''
1082    def __init__(self, LexD, DFA, RuleL, RuleNameDict = None):
1083        # for auto initialization set LexD,DFA,RuleL to None
1084        if LexD == None and DFA == None and RuleL == None:
1085            self.LexD = LexDictionary()
1086            # use a dummy root nonterminal -- must fix elsewhere!
1087            self.DFA = FSMachine("ERROR")
1088            self.RuleL = []
1089        else:
1090            self.LexD = LexD
1091            self.DFA = DFA
1092            self.RuleL = RuleL
1093        if RuleNameDict != None:
1094            self.AddNameDict(RuleNameDict)
1095        self.CleanUp()
1096
1097    def PrintDefaults(self):
1098        ''' look for default bindings
1099        '''
1100        print "Default bindings on:"
1101        PrintDefaultBindings(self.RuleL)
1102
1103    def SetCaseSensitivity( self, Boolean ):
1104        ''' setting case sensitivity: must happen before keyword installation
1105            in LexD.
1106        '''
1107        self.LexD.SetCaseSensitivity( Boolean )
1108
1109    def CleanUp(self):
1110        ''' this may be silly, but to save some space in construction
1111            a token dictionary may be used that facilitates sharing of
1112            token representations.  This method either initializes
1113            the dictionary or disposes of it if it exists
1114        '''
1115        self.IndexToToken = {}
1116        # this dictionary is used by automatically
1117        # generated grammars to determine whether
1118        # a string represents a nonterminal
1119        self.NonTermDict = {}
1120        # similarly for terminals
1121        self.TermDict = {}
1122        # this string may be used to keep a printable
1123        # representation of the rules of the grammar
1124        # (usually in automatic grammar generation
1125        self.RuleString = ""
1126
1127    # to associate a token to an integer use
1128    # self.IndexToToken[int] = tokenrep
1129    def AddNameDict(self, RuleNameDict):
1130        ''' this method associates rules to names using a
1131            RuleNameDict dictionary which maps names to rule indices.
1132            after invocation
1133              self.RuleNameToIndex[ name ] gives the index
1134                in self.RuleL for the rule associated with name, and
1135              self.RuleL[index].Name gives the name associated
1136                with the rule self.RuleL[index]
1137        '''
1138        self.RuleNameToIndex = RuleNameDict
1139        # add a Name attribute to the rules of the rule list
1140        for ruleName in RuleNameDict.keys():
1141            index = RuleNameDict[ ruleName ]
1142            self.RuleL[ index ].Name = ruleName
1143
1144    def DoParse( self, String, Context = None, DoReductions = 1 ):
1145        ''' parse a string using the grammar, return result and context
1146        '''
1147        # construct the ParserObj
1148        Stream = LexStringWalker( String, self.LexD )
1149        Stack = [] # {-1:0} #Walkers.SimpleStack()
1150        ParseOb = ParserObj( self.RuleL, Stream, self.DFA, Stack, \
1151                         DoReductions, Context )
1152        # do the parse
1153        ParseResult = ParseOb.GO()
1154        # return final result of reduction and the context
1155        return (ParseResult[1], Context)
1156
1157    def DoParse1( self, String, Context=None, DoReductions=1 ):
1158        ''' parse a string using the grammar, but only return
1159            the result of the last reduction, without the context
1160        '''
1161        return self.DoParse(String, Context, DoReductions)[0]
1162
1163    def Bind( self, Rulename, NewFunction ):
1164        ''' if the Name dictionary has been initialized
1165            this method will (re)bind a reduction function to
1166            a rule associated with Rulename
1167        '''
1168        ruleindex = self.RuleNameToIndex[ Rulename ]
1169        rule = self.RuleL[ ruleindex ]
1170        rule.ReductFun = NewFunction
1171
1172    def Addterm( self, termname, regexpstr, funct ):
1173        ''' bind a terminal to a regular expression and interp function
1174            in the lexical dictionary (convenience)
1175        '''
1176        self.TermDict[termname] =self.LexD.terminal(termname, regexpstr, funct)
1177
1178def NullGrammar():
1179    ''' function to create a "null grammar"
1180    '''
1181    return Grammar(None,None,None,{})
1182
1183def UnMarshalGram(file):
1184    ''' unmarshalling a marshalled grammar created by
1185          buildmodule.CGrammar.MarshalDump(Tofile)
1186          tightly coupled with buildmodule code...
1187        file should be open and "pointing to" the marshalled rep.
1188
1189        warning: doesn't bind semantics!
1190    '''
1191    Grammar = NullGrammar()
1192    UnMarshal = UnMarshaller(file, Grammar)
1193    UnMarshal.MakeLex()
1194    UnMarshal.MakeRules()
1195    UnMarshal.MakeTransitions()
1196    UnMarshal.Cleanup()
1197    return UnMarshal.Gram
1198
1199class UnMarshaller:
1200    ''' unmarshalling object for unmarshalling grammar from a python module
1201    '''
1202    def __init__(self, modulename, Grammar):
1203        import marshal
1204        self.Gram = Grammar
1205        marfile = __import__(modulename)
1206        for entry in modulename.split('.')[1:]:
1207            marfile = getattr(marfile, entry)
1208        self.tokens = marfile.tokens
1209        self.punct = marfile.punct
1210        self.comments = marfile.comments
1211        self.RuleTups = marfile.RuleTups
1212        self.MaxStates = marfile.MaxStates
1213        self.reducts = marfile.reducts
1214        self.moveTos = marfile.moveTos
1215        self.Root = marfile.Root
1216        self.CaseSensitivity = marfile.CaseSensitivity
1217
1218        Grammar.SetCaseSensitivity(self.CaseSensitivity)
1219
1220    def MakeLex(self):
1221        Grammar=self.Gram
1222        LexD = Grammar.LexD
1223        # punctuations
1224        LexD.punctuationlist = self.punct
1225        # comments
1226        for commentregex in self.comments:
1227            LexD.comment(commentregex)
1228        #LexD.commentstring = self.comments
1229        # keywords, terminals, nonterms
1230        #   rewrite the tokens list for sharing and extra safety
1231        LexTokens = {}
1232        tokens = self.tokens
1233        for tokenindex in range(len(tokens)):
1234            (kind,name) = tokens[tokenindex]
1235            if kind == KEYFLAG:
1236                tokens[tokenindex] = LexD.keyword(name)
1237            elif not kind in [TERMFLAG, NONTERMFLAG]:
1238                raise FlowError, "unknown token type"
1239        # not needed
1240        self.tokens = tokens
1241
1242    def MakeRules(self):
1243        Grammar = self.Gram
1244        Grammar.DFA.root_nonTerminal = self.Root
1245        NameIndex = Grammar.RuleNameToIndex
1246        RuleTuples = self.RuleTups
1247        nRules = len(RuleTuples)
1248        RuleList = [None] * nRules
1249        for index in range(nRules):
1250            (Name, Components) = RuleTuples[index]
1251            rule = apply(ParseRule, Components)
1252            rule.Name = Name
1253            RuleList[index] = rule
1254            NameIndex[Name] = index
1255        Grammar.RuleL = RuleList
1256
1257    def MakeTransitions(self):
1258        Grammar = self.Gram
1259        DFA = Grammar.DFA
1260        StateTokenMap = DFA.StateTokenMap
1261        tokens = self.tokens
1262        # record the state number
1263        DFA.maxState = self.MaxStates
1264        # this is historical, unfortunately...  CLEAN IT UP SOMEDAY!
1265        # THE DFA.States DICT IS NOT NEEDED (?) (here)
1266        for state in range(1, self.MaxStates+1):
1267            DFA.States[state] = [TRANSFLAG]
1268        # record the reductions
1269        for (fromState, TokenIndex, rulenum) in self.reducts:
1270            DFA.SetReduction(fromState, tokens[TokenIndex], rulenum)
1271        # record the transitions
1272        for (fromState, TokenIndex, ToState) in self.moveTos:
1273            DFA.SetMap(fromState, tokens[TokenIndex], ToState)
1274
1275    def Cleanup(self):
1276        Grammar = self.Gram
1277        Grammar.CleanUp()
1278
1279#
1280# $Log: kjParser.py,v $
1281# Revision 1.1  2005/06/05 05:51:05  jhorman
1282# initial checkin
1283#
1284# Revision 1.5  2002/05/11 02:59:05  richard
1285# Added info into module docstrings.
1286# Fixed docco of kwParsing to reflect new grammar "marshalling".
1287# Fixed bug in gadfly.open - most likely introduced during sql loading
1288# re-work (though looking back at the diff from back then, I can't see how it
1289# wasn't different before, but it musta been ;)
1290# A buncha new unit test stuff.
1291#
1292# Revision 1.4  2002/05/08 00:49:00  anthonybaxter
1293# El Grande Grande reindente! Ran reindent.py over the whole thing.
1294# Gosh, what a lot of checkins. Tests still pass with 2.1 and 2.2.
1295#
1296# Revision 1.3  2002/05/07 07:06:11  richard
1297# Cleaned up sql grammar compilation some more.
1298# Split up the BigList into its components too.
1299#
1300# Revision 1.2  2002/05/06 23:27:10  richard
1301# . made the installation docco easier to find
1302# . fixed a "select *" test - column ordering is different for py 2.2
1303# . some cleanup in gadfly/kjParseBuild.py
1304# . made the test modules runnable (remembering that run_tests can take a
1305#   name argument to run a single module)
1306# . fixed the module name in gadfly/kjParser.py
1307#
1308# Revision 1.1.1.1  2002/05/06 07:31:09  richard
1309#
1310#
1311#
Note: See TracBrowser for help on using the browser.