467 lines
20 KiB
C++
467 lines
20 KiB
C++
/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
 * Use of this file is governed by the BSD 3-clause license that
 * can be found in the LICENSE.txt file in the project root.
 */
|
|
|
|
#pragma once
|
|
|
|
#include "ANTLRErrorStrategy.h"
|
|
#include "misc/IntervalSet.h"
|
|
|
|
namespace antlr4 {
|
|
|
|
/**
|
|
* This is the default implementation of {@link ANTLRErrorStrategy} used for
|
|
* error reporting and recovery in ANTLR parsers.
|
|
*/
|
|
class ANTLR4CPP_PUBLIC DefaultErrorStrategy : public ANTLRErrorStrategy {
|
|
public:
|
|
DefaultErrorStrategy();
|
|
DefaultErrorStrategy(DefaultErrorStrategy const& other) = delete;
|
|
virtual ~DefaultErrorStrategy();
|
|
|
|
DefaultErrorStrategy& operator = (DefaultErrorStrategy const& other) = delete;
|
|
|
|
protected:
|
|
/**
|
|
* Indicates whether the error strategy is currently "recovering from an
|
|
* error". This is used to suppress reporting multiple error messages while
|
|
* attempting to recover from a detected syntax error.
|
|
*
|
|
* @see #inErrorRecoveryMode
|
|
*/
|
|
bool errorRecoveryMode;
|
|
|
|
/** The index into the input stream where the last error occurred.
|
|
* This is used to prevent infinite loops where an error is found
|
|
* but no token is consumed during recovery...another error is found,
|
|
* ad nauseum. This is a failsafe mechanism to guarantee that at least
|
|
* one token/tree node is consumed for two errors.
|
|
*/
|
|
int lastErrorIndex;
|
|
|
|
misc::IntervalSet lastErrorStates;
|
|
|
|
/// <summary>
|
|
/// {@inheritDoc}
|
|
/// <p/>
|
|
/// The default implementation simply calls <seealso cref="#endErrorCondition"/> to
|
|
/// ensure that the handler is not in error recovery mode.
|
|
/// </summary>
|
|
public:
|
|
virtual void reset(Parser *recognizer) override;
|
|
|
|
/// <summary>
|
|
/// This method is called to enter error recovery mode when a recognition
|
|
/// exception is reported.
|
|
/// </summary>
|
|
/// <param name="recognizer"> the parser instance </param>
|
|
protected:
|
|
virtual void beginErrorCondition(Parser *recognizer);
|
|
|
|
/// <summary>
|
|
/// {@inheritDoc}
|
|
/// </summary>
|
|
public:
|
|
virtual bool inErrorRecoveryMode(Parser *recognizer) override;
|
|
|
|
/// <summary>
|
|
/// This method is called to leave error recovery mode after recovering from
|
|
/// a recognition exception.
|
|
/// </summary>
|
|
/// <param name="recognizer"> </param>
|
|
protected:
|
|
virtual void endErrorCondition(Parser *recognizer);
|
|
|
|
/// <summary>
|
|
/// {@inheritDoc}
|
|
/// <p/>
|
|
/// The default implementation simply calls <seealso cref="#endErrorCondition"/>.
|
|
/// </summary>
|
|
public:
|
|
virtual void reportMatch(Parser *recognizer) override;
|
|
|
|
/// {@inheritDoc}
|
|
/// <p/>
|
|
/// The default implementation returns immediately if the handler is already
|
|
/// in error recovery mode. Otherwise, it calls <seealso cref="#beginErrorCondition"/>
|
|
/// and dispatches the reporting task based on the runtime type of {@code e}
|
|
/// according to the following table.
|
|
///
|
|
/// <ul>
|
|
/// <li><seealso cref="NoViableAltException"/>: Dispatches the call to
|
|
/// <seealso cref="#reportNoViableAlternative"/></li>
|
|
/// <li><seealso cref="InputMismatchException"/>: Dispatches the call to
|
|
/// <seealso cref="#reportInputMismatch"/></li>
|
|
/// <li><seealso cref="FailedPredicateException"/>: Dispatches the call to
|
|
/// <seealso cref="#reportFailedPredicate"/></li>
|
|
/// <li>All other types: calls <seealso cref="Parser#notifyErrorListeners"/> to report
|
|
/// the exception</li>
|
|
/// </ul>
|
|
virtual void reportError(Parser *recognizer, const RecognitionException &e) override;
|
|
|
|
/// <summary>
|
|
/// {@inheritDoc}
|
|
/// <p/>
|
|
/// The default implementation resynchronizes the parser by consuming tokens
|
|
/// until we find one in the resynchronization set--loosely the set of tokens
|
|
/// that can follow the current rule.
|
|
/// </summary>
|
|
virtual void recover(Parser *recognizer, std::exception_ptr e) override;
|
|
|
|
/**
|
|
* The default implementation of {@link ANTLRErrorStrategy#sync} makes sure
|
|
* that the current lookahead symbol is consistent with what were expecting
|
|
* at this point in the ATN. You can call this anytime but ANTLR only
|
|
* generates code to check before subrules/loops and each iteration.
|
|
*
|
|
* <p>Implements Jim Idle's magic sync mechanism in closures and optional
|
|
* subrules. E.g.,</p>
|
|
*
|
|
* <pre>
|
|
* a : sync ( stuff sync )* ;
|
|
* sync : {consume to what can follow sync} ;
|
|
* </pre>
|
|
*
|
|
* At the start of a sub rule upon error, {@link #sync} performs single
|
|
* token deletion, if possible. If it can't do that, it bails on the current
|
|
* rule and uses the default error recovery, which consumes until the
|
|
* resynchronization set of the current rule.
|
|
*
|
|
* <p>If the sub rule is optional ({@code (...)?}, {@code (...)*}, or block
|
|
* with an empty alternative), then the expected set includes what follows
|
|
* the subrule.</p>
|
|
*
|
|
* <p>During loop iteration, it consumes until it sees a token that can start a
|
|
* sub rule or what follows loop. Yes, that is pretty aggressive. We opt to
|
|
* stay in the loop as long as possible.</p>
|
|
*
|
|
* <p><strong>ORIGINS</strong></p>
|
|
*
|
|
* <p>Previous versions of ANTLR did a poor job of their recovery within loops.
|
|
* A single mismatch token or missing token would force the parser to bail
|
|
* out of the entire rules surrounding the loop. So, for rule</p>
|
|
*
|
|
* <pre>
|
|
* classDef : 'class' ID '{' member* '}'
|
|
* </pre>
|
|
*
|
|
* input with an extra token between members would force the parser to
|
|
* consume until it found the next class definition rather than the next
|
|
* member definition of the current class.
|
|
*
|
|
* <p>This functionality cost a little bit of effort because the parser has to
|
|
* compare token set at the start of the loop and at each iteration. If for
|
|
* some reason speed is suffering for you, you can turn off this
|
|
* functionality by simply overriding this method as a blank { }.</p>
|
|
*/
|
|
virtual void sync(Parser *recognizer) override;
|
|
|
|
/// <summary>
|
|
/// This is called by <seealso cref="#reportError"/> when the exception is a
|
|
/// <seealso cref="NoViableAltException"/>.
|
|
/// </summary>
|
|
/// <seealso cref= #reportError
|
|
/// </seealso>
|
|
/// <param name="recognizer"> the parser instance </param>
|
|
/// <param name="e"> the recognition exception </param>
|
|
protected:
|
|
virtual void reportNoViableAlternative(Parser *recognizer, const NoViableAltException &e);
|
|
|
|
/// <summary>
|
|
/// This is called by <seealso cref="#reportError"/> when the exception is an
|
|
/// <seealso cref="InputMismatchException"/>.
|
|
/// </summary>
|
|
/// <seealso cref= #reportError
|
|
/// </seealso>
|
|
/// <param name="recognizer"> the parser instance </param>
|
|
/// <param name="e"> the recognition exception </param>
|
|
virtual void reportInputMismatch(Parser *recognizer, const InputMismatchException &e);
|
|
|
|
/// <summary>
|
|
/// This is called by <seealso cref="#reportError"/> when the exception is a
|
|
/// <seealso cref="FailedPredicateException"/>.
|
|
/// </summary>
|
|
/// <seealso cref= #reportError
|
|
/// </seealso>
|
|
/// <param name="recognizer"> the parser instance </param>
|
|
/// <param name="e"> the recognition exception </param>
|
|
virtual void reportFailedPredicate(Parser *recognizer, const FailedPredicateException &e);
|
|
|
|
/**
|
|
* This method is called to report a syntax error which requires the removal
|
|
* of a token from the input stream. At the time this method is called, the
|
|
* erroneous symbol is current {@code LT(1)} symbol and has not yet been
|
|
* removed from the input stream. When this method returns,
|
|
* {@code recognizer} is in error recovery mode.
|
|
*
|
|
* <p>This method is called when {@link #singleTokenDeletion} identifies
|
|
* single-token deletion as a viable recovery strategy for a mismatched
|
|
* input error.</p>
|
|
*
|
|
* <p>The default implementation simply returns if the handler is already in
|
|
* error recovery mode. Otherwise, it calls {@link #beginErrorCondition} to
|
|
* enter error recovery mode, followed by calling
|
|
* {@link Parser#notifyErrorListeners}.</p>
|
|
*
|
|
* @param recognizer the parser instance
|
|
*/
|
|
virtual void reportUnwantedToken(Parser *recognizer);
|
|
|
|
/**
|
|
* This method is called to report a syntax error which requires the
|
|
* insertion of a missing token into the input stream. At the time this
|
|
* method is called, the missing token has not yet been inserted. When this
|
|
* method returns, {@code recognizer} is in error recovery mode.
|
|
*
|
|
* <p>This method is called when {@link #singleTokenInsertion} identifies
|
|
* single-token insertion as a viable recovery strategy for a mismatched
|
|
* input error.</p>
|
|
*
|
|
* <p>The default implementation simply returns if the handler is already in
|
|
* error recovery mode. Otherwise, it calls {@link #beginErrorCondition} to
|
|
* enter error recovery mode, followed by calling
|
|
* {@link Parser#notifyErrorListeners}.</p>
|
|
*
|
|
* @param recognizer the parser instance
|
|
*/
|
|
virtual void reportMissingToken(Parser *recognizer);
|
|
|
|
public:
|
|
/**
|
|
* {@inheritDoc}
|
|
*
|
|
* <p>The default implementation attempts to recover from the mismatched input
|
|
* by using single token insertion and deletion as described below. If the
|
|
* recovery attempt fails, this method throws an
|
|
* {@link InputMismatchException}.</p>
|
|
*
|
|
* <p><strong>EXTRA TOKEN</strong> (single token deletion)</p>
|
|
*
|
|
* <p>{@code LA(1)} is not what we are looking for. If {@code LA(2)} has the
|
|
* right token, however, then assume {@code LA(1)} is some extra spurious
|
|
* token and delete it. Then consume and return the next token (which was
|
|
* the {@code LA(2)} token) as the successful result of the match operation.</p>
|
|
*
|
|
* <p>This recovery strategy is implemented by {@link #singleTokenDeletion}.</p>
|
|
*
|
|
* <p><strong>MISSING TOKEN</strong> (single token insertion)</p>
|
|
*
|
|
* <p>If current token (at {@code LA(1)}) is consistent with what could come
|
|
* after the expected {@code LA(1)} token, then assume the token is missing
|
|
* and use the parser's {@link TokenFactory} to create it on the fly. The
|
|
* "insertion" is performed by returning the created token as the successful
|
|
* result of the match operation.</p>
|
|
*
|
|
* <p>This recovery strategy is implemented by {@link #singleTokenInsertion}.</p>
|
|
*
|
|
* <p><strong>EXAMPLE</strong></p>
|
|
*
|
|
* <p>For example, Input {@code i=(3;} is clearly missing the {@code ')'}. When
|
|
* the parser returns from the nested call to {@code expr}, it will have
|
|
* call chain:</p>
|
|
*
|
|
* <pre>
|
|
* stat → expr → atom
|
|
* </pre>
|
|
*
|
|
* and it will be trying to match the {@code ')'} at this point in the
|
|
* derivation:
|
|
*
|
|
* <pre>
|
|
* => ID '=' '(' INT ')' ('+' atom)* ';'
|
|
* ^
|
|
* </pre>
|
|
*
|
|
* The attempt to match {@code ')'} will fail when it sees {@code ';'} and
|
|
* call {@link #recoverInline}. To recover, it sees that {@code LA(1)==';'}
|
|
* is in the set of tokens that can follow the {@code ')'} token reference
|
|
* in rule {@code atom}. It can assume that you forgot the {@code ')'}.
|
|
*/
|
|
virtual Token* recoverInline(Parser *recognizer) override;
|
|
|
|
/// <summary>
|
|
/// This method implements the single-token insertion inline error recovery
|
|
/// strategy. It is called by <seealso cref="#recoverInline"/> if the single-token
|
|
/// deletion strategy fails to recover from the mismatched input. If this
|
|
/// method returns {@code true}, {@code recognizer} will be in error recovery
|
|
/// mode.
|
|
/// <p/>
|
|
/// This method determines whether or not single-token insertion is viable by
|
|
/// checking if the {@code LA(1)} input symbol could be successfully matched
|
|
/// if it were instead the {@code LA(2)} symbol. If this method returns
|
|
/// {@code true}, the caller is responsible for creating and inserting a
|
|
/// token with the correct type to produce this behavior.
|
|
/// </summary>
|
|
/// <param name="recognizer"> the parser instance </param>
|
|
/// <returns> {@code true} if single-token insertion is a viable recovery
|
|
/// strategy for the current mismatched input, otherwise {@code false} </returns>
|
|
protected:
|
|
virtual bool singleTokenInsertion(Parser *recognizer);
|
|
|
|
/// <summary>
|
|
/// This method implements the single-token deletion inline error recovery
|
|
/// strategy. It is called by <seealso cref="#recoverInline"/> to attempt to recover
|
|
/// from mismatched input. If this method returns null, the parser and error
|
|
/// handler state will not have changed. If this method returns non-null,
|
|
/// {@code recognizer} will <em>not</em> be in error recovery mode since the
|
|
/// returned token was a successful match.
|
|
/// <p/>
|
|
/// If the single-token deletion is successful, this method calls
|
|
/// <seealso cref="#reportUnwantedToken"/> to report the error, followed by
|
|
/// <seealso cref="Parser#consume"/> to actually "delete" the extraneous token. Then,
|
|
/// before returning <seealso cref="#reportMatch"/> is called to signal a successful
|
|
/// match.
|
|
/// </summary>
|
|
/// <param name="recognizer"> the parser instance </param>
|
|
/// <returns> the successfully matched <seealso cref="Token"/> instance if single-token
|
|
/// deletion successfully recovers from the mismatched input, otherwise
|
|
/// {@code null} </returns>
|
|
virtual Token* singleTokenDeletion(Parser *recognizer);
|
|
|
|
/// <summary>
|
|
/// Conjure up a missing token during error recovery.
|
|
///
|
|
/// The recognizer attempts to recover from single missing
|
|
/// symbols. But, actions might refer to that missing symbol.
|
|
/// For example, x=ID {f($x);}. The action clearly assumes
|
|
/// that there has been an identifier matched previously and that
|
|
/// $x points at that token. If that token is missing, but
|
|
/// the next token in the stream is what we want we assume that
|
|
/// this token is missing and we keep going. Because we
|
|
/// have to return some token to replace the missing token,
|
|
/// we have to conjure one up. This method gives the user control
|
|
/// over the tokens returned for missing tokens. Mostly,
|
|
/// you will want to create something special for identifier
|
|
/// tokens. For literals such as '{' and ',', the default
|
|
/// action in the parser or tree parser works. It simply creates
|
|
/// a CommonToken of the appropriate type. The text will be the token.
|
|
/// If you change what tokens must be created by the lexer,
|
|
/// override this method to create the appropriate tokens.
|
|
/// </summary>
|
|
virtual Token* getMissingSymbol(Parser *recognizer);
|
|
|
|
virtual misc::IntervalSet getExpectedTokens(Parser *recognizer);
|
|
|
|
/// <summary>
|
|
/// How should a token be displayed in an error message? The default
|
|
/// is to display just the text, but during development you might
|
|
/// want to have a lot of information spit out. Override in that case
|
|
/// to use t.toString() (which, for CommonToken, dumps everything about
|
|
/// the token). This is better than forcing you to override a method in
|
|
/// your token objects because you don't have to go modify your lexer
|
|
/// so that it creates a new class.
|
|
/// </summary>
|
|
virtual std::string getTokenErrorDisplay(Token *t);
|
|
|
|
virtual std::string getSymbolText(Token *symbol);
|
|
|
|
virtual size_t getSymbolType(Token *symbol);
|
|
|
|
virtual std::string escapeWSAndQuote(const std::string &s) const;
|
|
|
|
/* Compute the error recovery set for the current rule. During
|
|
* rule invocation, the parser pushes the set of tokens that can
|
|
* follow that rule reference on the stack; this amounts to
|
|
* computing FIRST of what follows the rule reference in the
|
|
* enclosing rule. See LinearApproximator.FIRST().
|
|
* This local follow set only includes tokens
|
|
* from within the rule; i.e., the FIRST computation done by
|
|
* ANTLR stops at the end of a rule.
|
|
*
|
|
* EXAMPLE
|
|
*
|
|
* When you find a "no viable alt exception", the input is not
|
|
* consistent with any of the alternatives for rule r. The best
|
|
* thing to do is to consume tokens until you see something that
|
|
* can legally follow a call to r *or* any rule that called r.
|
|
* You don't want the exact set of viable next tokens because the
|
|
* input might just be missing a token--you might consume the
|
|
* rest of the input looking for one of the missing tokens.
|
|
*
|
|
* Consider grammar:
|
|
*
|
|
* a : '[' b ']'
|
|
* | '(' b ')'
|
|
* ;
|
|
* b : c '^' INT ;
|
|
* c : ID
|
|
* | INT
|
|
* ;
|
|
*
|
|
* At each rule invocation, the set of tokens that could follow
|
|
* that rule is pushed on a stack. Here are the various
|
|
* context-sensitive follow sets:
|
|
*
|
|
* FOLLOW(b1_in_a) = FIRST(']') = ']'
|
|
* FOLLOW(b2_in_a) = FIRST(')') = ')'
|
|
* FOLLOW(c_in_b) = FIRST('^') = '^'
|
|
*
|
|
* Upon erroneous input "[]", the call chain is
|
|
*
|
|
* a -> b -> c
|
|
*
|
|
* and, hence, the follow context stack is:
|
|
*
|
|
* depth follow set start of rule execution
|
|
* 0 <EOF> a (from main())
|
|
* 1 ']' b
|
|
* 2 '^' c
|
|
*
|
|
* Notice that ')' is not included, because b would have to have
|
|
* been called from a different context in rule a for ')' to be
|
|
* included.
|
|
*
|
|
* For error recovery, we cannot consider FOLLOW(c)
|
|
* (context-sensitive or otherwise). We need the combined set of
|
|
* all context-sensitive FOLLOW sets--the set of all tokens that
|
|
* could follow any reference in the call chain. We need to
|
|
* resync to one of those tokens. Note that FOLLOW(c)='^' and if
|
|
* we resync'd to that token, we'd consume until EOF. We need to
|
|
* sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
|
|
* In this case, for input "[]", LA(1) is ']' and in the set, so we would
|
|
* not consume anything. After printing an error, rule c would
|
|
* return normally. Rule b would not find the required '^' though.
|
|
* At this point, it gets a mismatched token error and throws an
|
|
* exception (since LA(1) is not in the viable following token
|
|
* set). The rule exception handler tries to recover, but finds
|
|
* the same recovery set and doesn't consume anything. Rule b
|
|
* exits normally returning to rule a. Now it finds the ']' (and
|
|
* with the successful match exits errorRecovery mode).
|
|
*
|
|
* So, you can see that the parser walks up the call chain looking
|
|
* for the token that was a member of the recovery set.
|
|
*
|
|
* Errors are not generated in errorRecovery mode.
|
|
*
|
|
* ANTLR's error recovery mechanism is based upon original ideas:
|
|
*
|
|
* "Algorithms + Data Structures = Programs" by Niklaus Wirth
|
|
*
|
|
* and
|
|
*
|
|
* "A note on error recovery in recursive descent parsers":
|
|
* http://portal.acm.org/citation.cfm?id=947902.947905
|
|
*
|
|
* Later, Josef Grosch had some good ideas:
|
|
*
|
|
* "Efficient and Comfortable Error Recovery in Recursive Descent
|
|
* Parsers":
|
|
* ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip
|
|
*
|
|
* Like Grosch I implement context-sensitive FOLLOW sets that are combined
|
|
* at run-time upon error to avoid overhead during parsing.
|
|
*/
|
|
virtual misc::IntervalSet getErrorRecoverySet(Parser *recognizer);
|
|
|
|
/// <summary>
|
|
/// Consume tokens until one matches the given token set. </summary>
|
|
virtual void consumeUntil(Parser *recognizer, const misc::IntervalSet &set);
|
|
|
|
private:
|
|
std::vector<std::unique_ptr<Token>> _errorSymbols; // Temporarily created token.
|
|
void InitializeInstanceFields();
|
|
};
|
|
|
|
} // namespace antlr4
|