C/C++ - Lexer scritto a mano

16/08/14 13:17

Sto scrivendo un lexer per un linguaggio di programmazione simile al C, ma sto avendo dei problemi strani. Vi posto i miei sorgenti:

Lexer.h

#ifndef __LEXER__
#define __LEXER__

#include "Token.h"

#include <malloc.h>
#include <string.h>
#include <vector>

using namespace std;

// Status codes returned by Lexer::Analyze().
// EMPTY_BUFFER: the buffer length is 0, nothing to scan.
#define EMPTY_BUFFER -1
// NULL_BUFFER: the buffer pointer is NULL.
#define NULL_BUFFER -2
// NO_ERRORS: the scan loop ran to completion.
#define NO_ERRORS 0

// State of the scanner: which kind of input Lexer::Analyze() is currently
// inside. The numeric values are the ones the compiler would assign anyway.
enum LexReading {
	MultiLineComment  = 0, // between "/*" and "*/"
	SingleLineComment = 1, // between '#' and the end of the line
	Number            = 2, // accumulating the characters of a numeric literal
	String            = 3, // declared, but no visible code enters this state
	Identifier        = 4, // declared, but no visible code enters this state
	Source            = 5  // ordinary source text, looking for the next token
};

class Lexer {
public:
	Lexer(char*);
	short Analyze(void);

	//lexer data
	char* buffer;
	unsigned long bufferLength;

	//lex result
	vector<Token> Tokens;
	vector<char*> Identifiers;
	vector<char*> Strings;
	vector<char*> Numbers;
	vector<char> Characters;

	//statistics
	unsigned long MultiLineFoundComments;
	unsigned long SingleLineFoundComments;
};

#endif

Lexer.cpp

#include "Lexer.h"

// Binds the lexer to `buffer` without copying it: the text must stay alive
// and NUL-terminated for as long as the lexer is used.
//
// A null pointer is accepted. Its length is recorded as 0 so that Analyze()
// can report NULL_BUFFER; previously strlen() dereferenced the null pointer
// here, before Analyze() ever had the chance to check it.
Lexer::Lexer(char* buffer)
	: buffer(buffer),
	  bufferLength(buffer != NULL ? strlen(buffer) : 0UL),
	  MultiLineFoundComments(0UL),
	  SingleLineFoundComments(0UL)
{
}

short Lexer::Analyze(void)
{
	if (this->buffer == (char*)NULL)
		return NULL_BUFFER;
	else if (this->bufferLength == 0)
		return EMPTY_BUFFER;

	//what is the lexer reading?
	LexReading currentlyReading;

	unsigned long currentBufferCharacter = 0L; //the character that the leer will read
	vector<char> temp; //a temporary vector to store the characters of identifiers, numbers, strings and chars

	while (currentBufferCharacter < (this->bufferLength - 1))
	{
		switch (currentlyReading)
		{
			case MultiLineComment:
				//if this is the end of the comment
				if ((this->buffer[currentBufferCharacter] == '*') && (this->buffer[currentBufferCharacter + 1] == '/'))
				{
					currentlyReading = Source; //the lexer is going to read source code again
					this->MultiLineFoundComments++; //update the number of multi line comments lexed
					currentBufferCharacter++; //the lexer won't read the / character the next step
				}
				//else do nothing, I don't care about comments
				break;

			case SingleLineComment:
				//if this is the end of the line
				if (this->buffer[currentBufferCharacter] == '\n')
				{
					currentlyReading = Source; //the lexer is going to read source code again
					this->SingleLineFoundComments++; //update the number of single line comments lexed
				} //else do nothing, I don't care about comments
				break;

			case Number:
				if (((this->buffer[currentBufferCharacter] < 48) || (this->buffer[currentBufferCharacter] > 57)) && ((this->buffer[currentBufferCharacter] != '.') && (this->buffer[currentBufferCharacter] != 'D') && (this->buffer[currentBufferCharacter] != 'X') && (this->buffer[currentBufferCharacter] != 'B') && (this->buffer[currentBufferCharacter] != 'd') && (this->buffer[currentBufferCharacter] != 'x') && (this->buffer[currentBufferCharacter] != 'b') && (this->buffer[currentBufferCharacter] != 'a')&& (this->buffer[currentBufferCharacter] != 'A') && (this->buffer[currentBufferCharacter] != 'b') && (this->buffer[currentBufferCharacter] != 'C') && (this->buffer[currentBufferCharacter] != 'c') && (this->buffer[currentBufferCharacter] != 'd') && (this->buffer[currentBufferCharacter] != 'D') && (this->buffer[currentBufferCharacter] != 'e') && (this->buffer[currentBufferCharacter] != 'E') && (this->buffer[currentBufferCharacter] != 'f') && (this->buffer[currentBufferCharacter] != 'F')))
				{
					size_t characters = temp.size();
					char* numberStringFromTemp = (char*)malloc((unsigned long long int)(sizeof(char) * characters + 1));
					numberStringFromTemp[characters - 1] = (char)0x00;
					numberStringFromTemp[characters] = (char)0x00;
					size_t currentChar;
					for (currentChar = 0L; currentChar < characters; currentChar++)
						numberStringFromTemp[currentChar] = temp[currentChar];
					this->Numbers.push_back(numberStringFromTemp);
					this->Tokens.push_back(TNUMBER);
					temp.clear();
					currentlyReading = Source;
				} else {
					temp.push_back(this->buffer[currentBufferCharacter]); //store the read character
				}
				break;

			case Source:
				//if this is the beginning of a single line comment
				if (this->buffer[currentBufferCharacter] == '#') //single line comments aren't C-like
				{
					currentlyReading = SingleLineComment; //the lexer is going to read a single line comment
				}
				else if ((this->buffer[currentBufferCharacter] == '/') && (this->buffer[currentBufferCharacter + 1] == '*')) //multi line comments are C-like
				{
					currentlyReading = MultiLineComment; //the lexer is going to read a multi line comment
					currentBufferCharacter++; //the lexer won't read the * simbol
				}
				else if ((this->buffer[currentBufferCharacter] >= 48) && (this->buffer[currentBufferCharacter] <= 57)) // ASCII code of 0 is 48 and of 9 is 57
				{
					currentlyReading = Number; //the lexer is going to read a number
					currentBufferCharacter--; //a little trick: i want the lexer to read this character again (when the lexer will expect a number)
				}
				break;

			default:
				break;
		}
		currentBufferCharacter++; //next time I'll read the next character
	}
	//the lexer's job is done
	return NO_ERRORS;
}

e qui arriva il bello.... Lo provo:

#include <stdio.h>
#include <string.h>
#include <vector>

#include "Lexer.h"

int main(int argc, char** argv)
{
	char* src = "/* sd */ # /*s5d*/\n/*cd*/#f\n55 6.5H\n5.9B /* 5.7 */ 5f\n";

	Lexer Analyzer(src);
	if (Analyzer.Analyze() != 0)
	{
		printf("Si e' verificato un errore!");
	} else {
		printf("Commenti multilinea: %u\nCommenti: %u\n", Analyzer.MultiLineFoundComments, Analyzer.SingleLineFoundComments );
	}

	size_t i;
	printf("Numero di numeri: %u\nNumeri: ", Analyzer.Numbers.size());
	for (i = 0; i < Analyzer.Numbers.size(); i++)
	{
		puts(Analyzer.Numbers[i]);
		printf("  ");
	}
	return 0;
}

E mi accorgo che c'è un problema:
5f non è nella lista dei numeri..... Tuttavia il lexer lo inserisce se dopo 5f aggiungo un altro carattere.
Qualcuno ha un'idea sul perché?

Utilizzare regex o generatori di lexer è una decisione che ho scartato e su cui non intendo tornare.

aaa

16/08/14 16:07

pierotofy

while (currentBufferCharacter < this->bufferLength)

Invece di:

while (currentBufferCharacter < (this->bufferLength - 1))

?

Così ad occhio, non l'ho testato.

Il mio blog: piero.dev

16/08/14 16:32

TheDarkJuster

Funziona perfettamente ora, grazie mille. Era rimasto quel -1 da una prova antecedente

aaa

16/08/14 17:14

pierotofy

Complimenti per l'impresa di scrivere un lexer a mano!

Il mio blog: piero.dev

16/08/14 18:25

TheDarkJuster

Imparare a usare le regex per me è peggio

aaa