slcom/slangc/parser/Scan.sauce

package slangc.parser;

public class Scan {
	// Language definition; the scanner queries it for comment prefixes, string/char terminators,
	// escapes, numeric prefixes, operators and keywords.
	private final Language language;
	// The source text being scanned.
	private final Source source;
    // Character classifier taken from source.getEncoding() in the constructor.
    private final SourceEncoding encoding;
	// Token buffer; only the first ntokens slots are in use. append() grows it on demand.
	private Token[] tokens = new Token[1000];
    // Number of tokens currently stored in the tokens array.
    int ntokens = 0;

    /**
     * Forgets every token collected so far and swaps in a fresh, empty 20-slot buffer.
     */
    public void reset() {
        ntokens = 0;
        tokens = new Token[20];
    }

	/**
	 * Creates a scanner for the given source, tokenised according to the given language.
	 * The character encoding is read from the source once, here.
	 *
	 * @param language the language definition consulted while scanning
	 * @param source the source text to scan; its encoding is obtained via getEncoding()
	 */
	public Scan(Language language, Source source) {
		this.source = source;
		this.language = language;
		this.encoding = source.getEncoding();
	}

	/**
	 * @return the language definition this scan was constructed with
	 */
	public Language getLanguage() {
		return this.language;
	}

	/**
	 * @return the source this scan was constructed with
	 */
	public Source getSource() {
		return this.source;
	}

	/**
	 * @return the number of tokens stored so far (not the capacity of the backing array)
	 */
	public int countTokens() {
		return this.ntokens;
	}

	/**
	 * Looks up a stored token by position.
	 *
	 * @param index zero-based position among the stored tokens
	 * @return the token at that position, or null when index is negative or not below countTokens()
	 */
	public Token getToken(int index) {
		if (index >= 0 && index < ntokens) {
			return tokens[index];
		}
		return null;
	}

	/**
	 * Scans the whole source and appends every token, comments included, up to and
	 * including the END_OF_FILE token.
	 *
	 * @return the number of tokens appended by this call
	 */
	public int scanAll() {
		// scanQuick() leaves nullifyComments set, which makes tokenAt() return null for
		// comments. This loop appends and dereferences every token unconditionally, so the
		// flag must be cleared first (scanPedantic() does the same).
		nullifyComments = false;
		SourceWalker w = new SourceWalker(source);
		Token t;
		int i = 0;
		do {
			t = tokenAt(w);
			append(t);
			i++;
		} while (!t.is(TokenType.END_OF_FILE));
		return i;
	}

    // When set, innerTokenAt() returns null for line comments and terminated long comments
    // instead of building tokens for them.
    boolean nullifyComments = false;

	/**
	 * Scans the whole source, keeping only non-comment tokens. Comments are dropped, whether
	 * they come back from tokenAt() as null or as comment tokens.
	 *
	 * @return the number of tokens appended by this call, including the END_OF_FILE token
	 */
	public int scanQuick() {
		nullifyComments = true;
		SourceWalker walker = new SourceWalker(source);
		int kept = 0;
		while (true) {
			Token current = tokenAt(walker);
			if (current == null) {
				// A nullified comment: nothing to store, keep scanning.
				continue;
			}
			if (current.is(TokenType.LINE_COMMENT) || current.is(TokenType.LONG_COMMENT)) {
				// A comment that still came back as a token: skip it as well.
				continue;
			}
			append(current);
			kept++;
			if (current.is(TokenType.END_OF_FILE)) {
				break;
			}
		}
		return kept;
	}

	/**
	 * Scans the whole source, storing only non-comment tokens but attaching every comment to
	 * a neighbouring token as a CommentAnnotation.
	 *
	 * A comment that starts on the same line as the most recent non-comment token is attached
	 * to that token. Any other comment is held back and attached to the next non-comment
	 * token (which may be the END_OF_FILE token).
	 *
	 * @return the number of tokens appended by this call, including the END_OF_FILE token
	 */
	public int scanPedantic() {
		nullifyComments = false;
		SourceWalker walker = new SourceWalker(source);
		Token lastCode = null;
		Token[] pending = new Token[0];
		int appended = 0;
		Token current;
		do {
			current = tokenAt(walker);
			boolean isComment = current.is(TokenType.LINE_COMMENT) || current.is(TokenType.LONG_COMMENT);
			if (!isComment) {
				lastCode = current;
				if (pending.length != 0) {
					// Hand the held-back comments to this token, in source order.
					int k = 0;
					while (k < pending.length) {
						lastCode.annotate(new CommentAnnotation(pending[k]));
						k++;
					}
					pending = new Token[0];
				}
				append(current);
				appended++;
			} else {
				boolean trailing = lastCode != null
						&& lastCode.getSnippet().getStart().getLineCount() == current.getSnippet().getStart().getLineCount();
				if (trailing) {
					lastCode.annotate(new CommentAnnotation(current));
				} else {
					// Hold the comment back until the next non-comment token shows up.
					Token[] grown = new Token[pending.length + 1];
					int k = 0;
					while (k < pending.length) {
						grown[k] = pending[k];
						k++;
					}
					grown[pending.length] = current;
					pending = grown;
				}
			}
		} while (!current.is(TokenType.END_OF_FILE));
		return appended;
	}

	/**
	 * Stores a token at the end of the token list and marks this scan as its owner.
	 * When the backing array is full it is first replaced by one of size 2 * length + 1.
	 *
	 * @param t the token to store
	 */
	public void append(Token t) {
		if (ntokens >= tokens.length) {
			Token[] grown = new Token[tokens.length * 2 + 1];
			int k = 0;
			while (k < tokens.length) {
				grown[k] = tokens[k];
				k++;
			}
			tokens = grown;
		}
		tokens[ntokens] = t;
		t.setOwner(this);
		ntokens++;
	}

	/**
	 * Advances the walker until it is at the end of its source or at a character the
	 * encoding does not classify as a space.
	 *
	 * @param walker the walker to advance
	 */
	public void skipSpaces(SourceWalker walker) {
		while (true) {
			if (walker.isAtEnd()) {
				return;
			}
			if (!encoding.isSpace(walker.getSource().getCharacter(walker.getIndex()))) {
				return;
			}
			walker.advance();
		}
	}

	/**
	 * Scans the next token via innerTokenAt() and, when one is produced, marks this scan as
	 * its owner.
	 *
	 * @param walker the walker positioned where scanning should start; it is advanced
	 * @return the scanned token, or null when innerTokenAt() returned null
	 */
	public Token tokenAt(SourceWalker walker) {
		Token scanned = innerTokenAt(walker);
		if (scanned == null) {
			return null;
		}
		scanned.setOwner(this);
		return scanned;
	}
	/**
	 * Skips leading whitespace, then scans the single token that starts at the walker's
	 * position and leaves the walker just past it. The owner of the token is not set here;
	 * tokenAt() does that.
	 *
	 * The kinds are tried in this order, and the first match wins: end of input, line
	 * comment, long comment, string, char, number, operator, name/keyword. Anything else
	 * becomes a one-character UNEXPECTED_CHARACTER token.
	 *
	 * @param walker the walker positioned where scanning should start; it is advanced
	 * @return the scanned token, or null for a line comment or a terminated long comment
	 *         while nullifyComments is set
	 */
	public Token innerTokenAt(SourceWalker walker) {
		skipSpaces(walker);
		if (walker.isAtEnd()) {
			return new Token(TokenType.END_OF_FILE, new SourceSnippet(walker.getPosition(), 0));
		}

		if (source.matches(walker.getIndex(), language.getLineCommentPrefixes())) {
			return readLineComment(walker);
		} else if (source.matches(walker.getIndex(), language.getLongCommentTerminators())) {
			return readLongComment(walker);
		} else if (source.matches(walker.getIndex(), language.getStringTerminators())) {
			return readString(walker);
		} else if (source.matches(walker.getIndex(), language.getCharTerminators())) {
			return readChar(walker);
		} else if (startsNumber(walker)) {
			return readNumber(walker);
		} else if (source.matches(walker.getIndex(), language.getOperators())) {
			return readOperator(walker);
		} else if (encoding.isValidNameHead(source.getCharacter(walker.getIndex()))) {
			return readNameOrKeyword(walker);
		} else {
			SourcePosition startPosition = walker.getPosition();
			walker.advance();
			return new Token(TokenType.UNEXPECTED_CHARACTER, new SourceSnippet(startPosition, 1));
		}
	}

	// Consumes a line comment up to (not including) the next newline or the end of input.
	// Returns null instead of a token while nullifyComments is set.
	private Token readLineComment(SourceWalker walker) {
		SourcePosition startPosition = walker.getPosition();

		int i = 0;
		while (!walker.isAtEnd() && !encoding.isNewline(walker.getSource().getCharacter(walker.getIndex()))) {
			walker.advance();
			i++;
		}

		if (nullifyComments) {
			return null;
		}
		return new Token(TokenType.LINE_COMMENT, new SourceSnippet(startPosition, i));
	}

	// Consumes a long comment through its terminator. An unterminated comment is always
	// returned as a token (annotated UNTERMINATED_TOKEN), even while nullifyComments is set.
	private Token readLongComment(SourceWalker walker) {
		SourcePosition startPosition = walker.getPosition();
		String end = source.match(walker.getIndex(), language.getLongCommentTerminators());

		// NOTE(review): the search for the terminator starts at the opener itself, so an
		// opener whose tail overlaps the terminator may be taken as a complete comment.
		// Confirm against the semantics of Source.match before changing.
		int i = 0;
		while (!walker.isAtEnd() && !source.matches(walker.getIndex(), end)) {
			walker.advance();
			i++;
		}

		boolean unterminated = walker.isAtEnd();
		if (!unterminated) {
			walker.advance(end.intsLength());
			i += end.intsLength();
		}

		if (nullifyComments && !unterminated) {
			return null;
		}
		Token result = new Token(TokenType.LONG_COMMENT, new SourceSnippet(startPosition, i));
		if (unterminated) {
			result.annotate(ErrorType.UNTERMINATED_TOKEN);
		}
		return result;
	}

	// Consumes a string literal. Scanning stops at the closing terminator, at a forbidden
	// escape (token annotated BAD_ESCAPE) or at the end of input (UNTERMINATED_TOKEN).
	private Token readString(SourceWalker walker) {
		SourcePosition startPosition = walker.getPosition();
		String end = source.match(walker.getIndex(), language.getStringTerminators());

		walker.advance(); // Must read at least the quote or it will likely match as end too!
		int i = 1;

		while (!walker.isAtEnd() && !source.matches(walker.getIndex(), end) && !source.matches(walker.getIndex(), language.getStringForbiddenEscapes())) {
			if (source.matches(walker.getIndex(), language.getStringEscapes())) {
				walker.advance();
				i++;
				if (walker.isAtEnd()) {
					// The escape introducer was the last character of the input: do not
					// advance or count past the end.
					break;
				}
			}
			walker.advance();
			i++;
		}

		boolean unterminated = false;
		boolean badEscape = false;

		if (walker.isAtEnd()) {
			unterminated = true;
		} else if (source.matches(walker.getIndex(), language.getStringForbiddenEscapes())) {
			badEscape = true;
		} else {
			walker.advance(end.intsLength());
			i += end.intsLength();
		}

		Token result = new Token(TokenType.STRING, new SourceSnippet(startPosition, i));
		if (unterminated) {
			result.annotate(ErrorType.UNTERMINATED_TOKEN);
		}
		if (badEscape) {
			result.annotate(ErrorType.BAD_ESCAPE);
		}
		return result;
	}

	// Consumes a char literal. Unlike strings, the scan loop does not stop at forbidden
	// escapes and it uses the string escape list to skip escaped characters.
	private Token readChar(SourceWalker walker) {
		SourcePosition startPosition = walker.getPosition();
		String end = source.match(walker.getIndex(), language.getCharTerminators());

		walker.advance(); // Must read at least the quote or it will likely match as end too!
		int i = 1;

		while (!walker.isAtEnd() && !source.matches(walker.getIndex(), end)) {
			if (source.matches(walker.getIndex(), language.getStringEscapes())) {
				walker.advance();
				i++;
				if (walker.isAtEnd()) {
					// The escape introducer was the last character of the input: do not
					// advance or count past the end.
					break;
				}
			}
			walker.advance();
			i++;
		}

		boolean unterminated = false;
		boolean badEscape = false;

		// NOTE(review): the loop above only stops at the end of input or at the terminator,
		// so the forbidden-escape branch fires only if the terminator position also matches
		// a forbidden escape. Kept as in the original; confirm intent.
		if (walker.isAtEnd()) {
			unterminated = true;
		} else if (source.matches(walker.getIndex(), language.getCharForbiddenEscapes())) {
			badEscape = true;
		} else {
			walker.advance(end.intsLength());
			i += end.intsLength();
		}

		Token result = new Token(TokenType.CHAR, new SourceSnippet(startPosition, i));
		if (unterminated) {
			result.annotate(ErrorType.UNTERMINATED_TOKEN);
		}
		if (badEscape) {
			result.annotate(ErrorType.BAD_ESCAPE);
		}
		return result;
	}

	// Tells whether a numeric literal starts at the walker's position: a hex or binary
	// prefix, a digit, or a decimal point immediately followed by a digit.
	private boolean startsNumber(SourceWalker walker) {
		return source.matches(walker.getIndex(), language.getHexPrefixes())
				|| source.matches(walker.getIndex(), language.getBinPrefixes())
				|| encoding.isDigit(source.getCharacter(walker.getIndex()))
				// Also check obscure case where float starts with a dot
				|| (encoding.isDecimalPoint(source.getCharacter(walker.getIndex())) && encoding.isDigit(source.getCharacter(walker.getIndex() + 1)));
	}

	// Consumes a numeric literal: optional hex/binary prefix, digits, optional fraction,
	// optional exponent, then suffixes. A fraction, an exponent or a float suffix makes it
	// a float; integer suffixes are only consumed for integers.
	private Token readNumber(SourceWalker walker) {
		SourcePosition startPosition = walker.getPosition();
		int i;
		boolean isFloat = false;
		int radix = 10;

		if (source.matches(walker.getIndex(), language.getHexPrefixes())) {
			String prefix = source.match(walker.getIndex(), language.getHexPrefixes());
			i = prefix.intsLength();
			walker.advance(i);
			radix = 16;
		} else if (source.matches(walker.getIndex(), language.getBinPrefixes())) {
			String prefix = source.match(walker.getIndex(), language.getBinPrefixes());
			i = prefix.intsLength();
			walker.advance(i);
			radix = 2;
		} else {
			// Decimal: the first character is either a digit or the leading decimal point.
			if (encoding.isDecimalPoint(source.getCharacter(walker.getIndex()))) {
				isFloat = true;
			}
			walker.advance();
			i = 1;
		}

		i += skipDigits(walker, radix);

		if (encoding.isDecimalPoint(source.getCharacter(walker.getIndex()))) {
			isFloat = true;
			walker.advance();
			i++;
			i += skipDigits(walker, radix);
		}

		if (encoding.isExponentSymbol(source.getCharacter(walker.getIndex()))) {
			isFloat = true; // Allow floats to be made using exponent alone without decimal point
			walker.advance();
			i++;
			if (encoding.isSign(source.getCharacter(walker.getIndex()))) {
				walker.advance();
				i++;
			}
			i += skipDigits(walker, radix);
		}

		while (encoding.isFloatSuffix(source.getCharacter(walker.getIndex()))) {
			isFloat = true; // Allow floats to be made just by adding a float suffix
			walker.advance();
			i++;
		}

		if (!isFloat) {
			while (encoding.isIntegerSuffix(source.getCharacter(walker.getIndex()))) {
				walker.advance();
				i++;
			}
		}

		return new Token(numberTokenType(radix, isFloat), new SourceSnippet(startPosition, i));
	}

	// Advances over a run of digits/underscores valid in the given radix; returns its length.
	private int skipDigits(SourceWalker walker, int radix) {
		int count = 0;
		while (encoding.isDigitOrUnderscore(source.getCharacter(walker.getIndex()), radix)) {
			walker.advance();
			count++;
		}
		return count;
	}

	// Maps a radix (2, 10 or 16) and float-ness to the matching numeric token type.
	private TokenType numberTokenType(int radix, boolean isFloat) {
		switch (radix) {
		case 2:
			return isFloat ? TokenType.BIN_FLOAT : TokenType.BIN_INTEGER;
		case 10:
			return isFloat ? TokenType.DEC_FLOAT : TokenType.DEC_INTEGER;
		case 16:
			return isFloat ? TokenType.HEX_FLOAT : TokenType.HEX_INTEGER;
		default:
			throw new Error("INTERNAL ERROR: Bad base number: " + radix);
		}
	}

	// Consumes the operator matched at the walker's position.
	private Token readOperator(SourceWalker walker) {
		SourcePosition startPosition = walker.getPosition();
		String op = source.match(walker.getIndex(), language.getOperators());
		walker.advance(op.intsLength());
		return new Token(TokenType.OPERATOR, new SourceSnippet(startPosition, op.intsLength()));
	}

	// Consumes a name and classifies it as KEYWORD when its text is one of the language's
	// keywords, NAME otherwise.
	private Token readNameOrKeyword(SourceWalker walker) {
		SourcePosition startPosition = walker.getPosition();

		walker.advance();
		int i = 1;

		int c;
		// Lowercase ASCII letters are accepted directly before falling back to the encoding's check.
		while (((c = source.getCharacter(walker.getIndex())) >= 'a' && c <= 'z') || encoding.isValidNameTail(c)) {
			walker.advance();
			i++;
		}

		SourceSnippet s = new SourceSnippet(startPosition, i);
		String n = s.getSource();

		String[] kws = language.getKeywords();
		for (int j = 0; j < kws.length; j++) {
			// NOTE(review): relies on == comparing string contents in this dialect; under
			// plain Java semantics this would be a reference comparison and need equals().
			if (n == kws[j]) {
				return new Token(TokenType.KEYWORD, s);
			}
		}
		return new Token(TokenType.NAME, s);
	}
}