about summary refs log tree commit diff
path: root/third_party/lisp/npg/src/parser.lisp
blob: c15d26fe394e2b90403e680ad687193527bad940 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
;;;  parser.lisp --- runtime parser

;;;  Copyright (C) 2003-2006, 2009 by Walter C. Pelissero

;;;  Author: Walter C. Pelissero <walter@pelissero.de>
;;;  Project: NPG a Naive Parser Generator

#+cmu (ext:file-comment "$Module: parser.lisp $")

;;; This library is free software; you can redistribute it and/or
;;; modify it under the terms of the GNU Lesser General Public License
;;; as published by the Free Software Foundation; either version 2.1
;;; of the License, or (at your option) any later version.
;;; This library is distributed in the hope that it will be useful,
;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;;; Lesser General Public License for more details.
;;; You should have received a copy of the GNU Lesser General Public
;;; License along with this library; if not, write to the Free
;;; Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
;;; 02111-1307 USA

;;;  Commentary:
;;;
;;; This is the runtime part of the parser.  The code that is
;;; responsible to execute the parser defined with the primitives
;;; found in define.lisp.

(in-package :naive-parser-generator)

(defvar *debug* nil
  "Either nil or a stream where to write the debug informations.")
#+debug (declaim (fixnum *maximum-recursion-depth*))
#+debug (defvar *maximum-recursion-depth* 1000
  "Maximum depth the parser is allowed to recursively call itself.
This is the only way for the parser to detect a loop in the grammar.
Tune this if your grammar is unusually complex.")

(declaim (inline reduce-production))
(defun reduce-production (production arguments)
  "Apply PRODUCTION's action on ARGUMENTS.  This has the effect of
  \"reducing\" the production."
  (when *debug*
    (format *debug* "reducing ~S on ~S~%" production arguments))
  (flet ((safe-token-value (token)
           (if (token-p token)
               (token-value token)
               token)))
    (apply (prod-action production) (mapcar #'safe-token-value arguments))))

(defgeneric later-position (pos1 pos2)
  (:documentation
   "Compare two file postions and return true if POS1 is later than
POS2 in the input stream."))

;; This is meant to be overloaded in the lexer
(defmethod later-position ((pos1 integer) (pos2 integer))
  (> pos1 pos2))

;; this looks silly but turns out to be useful (see below)
(defmethod later-position (pos1 pos2)
  (and (eq pos1 :eof) (not (eq pos2 :eof))))

(defgeneric read-next-tokens (tokens-source)
  (:documentation "Read next token from a lexical analysed stream.  The nature of
TOKENS-SOURCE is implementation dependent and any lexical analyzer is
supposed to specialise this method."))

;; This is the actual parser.  the algorithm is pretty
;; straightforward, the execution of the reductions a bit less.  Error
;; recovery is rather clumsy.

(defun parse (grammar start tokenizer)
  "Match a GRAMMAR against the list of input tokens coming from TOKENIZER.
Return the reduced values according to the nonterminal actions.  Raise
an error on failure."
  (declare (type grammar grammar)
           (type symbol start))
  (labels
      ((match-token (expected token)
         (when *debug*
           (format *debug* "match-token ~S ~S -> " expected token))
         (let ((res (cond ((symbolp expected)
                           ;; non-costant terminal (like identifiers)
                           (eq expected (token-type token)))
                          ((and (stringp expected)
                                (stringp (token-value token)))
                           ;; string costant terminal
                           (funcall (the function (grammar-equal-p grammar)) expected (token-value token)))
                          ((functionp expected)
                           ;; custom equality predicate (must be able
                           ;; to deal with token objects)
                           (funcall expected token))
                          ;; all the rest
                          (t (equal expected (token-value token))))))
           (when *debug*
             (format *debug* "~Amatched~%" (if res "" "not ")))
           res))
       (match (expected matched #+debug depth)
         (declare (list expected matched)
                  #+debug (fixnum depth))
         (let ((first-expected (car expected)))
           (cond #+debug ((> depth *maximum-recursion-depth*)
                  (error "endless recursion on ~A ~A at ~A expecting ~S"
                         (token-type (car matched)) (token-value (car matched))
                         (token-position (car matched)) expected))
                 ((eq first-expected :any)
                  (match (cdr expected) (cdr matched) #+debug depth))
                 ;; This is a trick to obtain partial parses.  When we
                 ;; reach this expected token we assume we succeeded
                 ;; the parsing and return the remaining tokens as
                 ;; part of the match.
                 ((eq first-expected :rest)
                  ;; we could be at the end of input so we check this
                  (unless (cdr matched)
                    (setf (cdr matched) (list :rest)))
                  (list nil nil))
                 ((rule-p first-expected)
                  ;; If it's a rule, then we try to match all its
                  ;; productions.  We return the first that succeeds.
                  (loop
                     for production in (rule-productions first-expected)
                     for production-tokens of-type list = (prod-tokens production)
                     with last-error-position = nil
                     with last-error = nil
                     for (error-position error-descr) =
                       (progn
                         (when *debug*
                           (format *debug* "trying to match ~A: ~S~%"
                                   (rule-name first-expected) production-tokens))
                         (match (append production-tokens (cdr expected)) matched #+debug (1+ depth)))
                     do (cond ((not error-position)
                               (return (let ((args-count (prod-tokens-length production)))
                                         (setf (cdr matched)
                                               (cons (reduce-production
                                                      production
                                                      (subseq (the list (cdr matched)) 0 args-count))
                                                     (nthcdr (1+ args-count) matched)))
                                         (list nil nil))))
                              ((or (not last-error)
                                   (later-position error-position last-error-position))
                               (setf last-error-position error-position
                                     last-error error-descr)))
                     ;; if everything fails return the "best" error
                     finally (return (list last-error-position
                                           (if *debug*
                                               #'(lambda ()
                                                   (format nil "~A, trying to match ~A"
                                                           (funcall (the function last-error))
                                                           (rule-name first-expected)))
                                               last-error)))))
                 (t
                  ;; if necessary load the next tokens
                  (when (null (cdr matched))
                    (setf (cdr matched) (read-next-tokens tokenizer)))
                  (cond ((and (or (null expected) (eq first-expected :eof))
                              (null (cdr matched)))
                         ;; This point is reached only once for each complete
                         ;; parsing.  The expected tokens and the input
                         ;; tokens have been exhausted at the same time.
                         ;; Hence we succeeded the parsing.
                         (setf (cdr matched) (list :eof))
                         (list nil nil))
                        ((null expected)
                         ;; Garbage at end of parsing.  This may mean that we
                         ;; have considered a production completed too soon.
                         (list (token-position (car matched))
                               #'(lambda ()
                                   "garbage at end of parsing")))
                        ((null (cdr matched))
                         ;; EOF error
                         (list :eof
                               #'(lambda ()
                                   (format nil "end of input expecting ~S" expected))))
                        (t ;; normal token
                         (let ((first-token (cadr matched)))
                           (if (match-token first-expected first-token)
                               (match (cdr expected) (cdr matched) #+debug depth)
                               ;; failed: we return the error
                               (list (token-position first-token)
                                     #'(lambda ()
                                         (format nil "expected ~S but got ~S ~S"
                                                 first-expected (token-type first-token)
                                                 (token-value first-token)))))))))))))
    (declare (inline match-token))
    (let ((result (list :head)))
      (destructuring-bind (error-position error)
          (match (list (find-rule start (grammar-rules grammar))) result #+debug 0)
        (when error-position
          (error "~A at ~A~%" (funcall (the function error)) error-position))
        (cadr result)))))

(defgeneric terminals-in-grammar (grammar-or-hashtable)
  (:documentation
   "Find non constant terminal symbols in GRAMMAR."))

(defmethod terminals-in-grammar ((grammar hash-table))
  (loop
     for rule being each hash-value of grammar
     with terminals = '()
     do (loop
           for prod in (rule-productions rule)
           do (loop
                 for tok in (prod-tokens prod)
                 when (symbolp tok)
                 do (pushnew tok terminals)))
     finally (return terminals)))

(defmethod terminals-in-grammar ((grammar grammar))
  (terminals-in-grammar (grammar-rules grammar)))

(defun print-grammar-figures (grammar &optional (stream *standard-output*))
  (format stream "rules: ~A~%constant terminals: ~A~%variable terminals: ~S~%"
          (hash-table-count (grammar-rules grammar))
          (hash-table-count (grammar-keywords grammar))
          (terminals-in-grammar (grammar-rules grammar))))


(defun grammar-keyword-p (keyword grammar)
  "Check if KEYWORD is part of this grammar."
  (find-keyword keyword (grammar-keywords grammar)))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(defvar *grammars* (make-hash-table))

(defun find-grammar (name)
  (gethash name *grammars*))

(defun delete-grammar (name)
  (remhash name *grammars*))

(defun add-grammar (name grammar)
  (setf (gethash name *grammars*) grammar))