C/C++ - Lexer, modifiche e malfunzionamenti molesti

21/08/14 20:28

L'altra sera ero felice che il mio lexer funzionasse bene, così ho deciso di modificarlo: vector<Token> è diventato vector<Token>*. La modifica ha avuto successo, così ho deciso di memorizzare nei token anche il numero di carattere nella riga corrispondente a quel token, e qui sono cominciati i problemi: lancio l'esecuzione del programma e mi viene stampato a video "numero commenti: 0" e "numero commenti multiriga: 0", e non c'è nessun token nel vettore. Non riesco a capire cosa non va, aiuto.

/*
 * Lexer.h
 *
 *  Created on: 14/08/2014
 *      Author: Denis
 */

#ifndef __LEXER__
#define __LEXER__

#include "Token.h"

#include <string.h>
#include <vector>
#include <memory>

using namespace std;

// Status codes returned by Lexer::Analyze().
#define EMPTY_BUFFER -1 // the source buffer has length zero
#define NULL_BUFFER -2 // the source buffer pointer is NULL
#define NO_ERRORS 0 // the analysis ran to the end of the buffer

/*
 * Kind of input the lexer is currently consuming.
 * Lexer::Analyze() switches on this value for every character it reads.
 */
enum LexReading {
	MultiLineComment = 0,	// between "/*" and "*/"
	SingleLineComment = 1,	// after '#', up to the end of the line
	Number = 2,		// collecting the characters of a numeric literal
	String = 3,		// between two double quotes
	Identifier = 4,		// collecting letters and underscores
	Source = 5		// plain source code, outside any of the above
};

class Lexer {
public:
	Lexer(char*, vector<Token>*);
	short Analyze(void);

	//lexer data
	char* buffer;
	unsigned long bufferLength;

	//lex result
	vector<Token>* Tokens;

	//statistics
	unsigned long MultiLineFoundComments;
	unsigned long SingleLineFoundComments;
};

#endif /* __LEXER__ */

#include "Lexer.h"

/*
 * Builds a lexer over `buffer`, which will append its tokens to `*tokens`.
 *
 * buffer: NUL-terminated source text, or NULL. A NULL buffer is accepted here
 *         (its length is recorded as 0) so that Analyze() can report
 *         NULL_BUFFER instead of the constructor crashing inside strlen().
 * tokens: vector that receives the tokens; only the pointer is stored.
 */
Lexer::Lexer(char* buffer, vector<Token>* tokens)
	: buffer(buffer), //save the buffer pointer
	  bufferLength((buffer != NULL) ? strlen(buffer) : 0), //strlen(NULL) is undefined behaviour, so guard it
	  Tokens(tokens), //save the pointer
	  MultiLineFoundComments(0L),
	  SingleLineFoundComments(0L)
{
}

short Lexer::Analyze(void)
{
	//check if the buffer is null or empty and return the proper error if so
	if (this->buffer == (char*)NULL)
		return NULL_BUFFER;
	else if (this->bufferLength == 0)
		return EMPTY_BUFFER;

	//what is the lexer reading?
	LexReading currentlyReading;

	unsigned long currentBufferCharacter = 0L; //the character that the lexer will read
	unsigned long currentBufferRow = 1L; //the user will know the exact line of an error (or warning) if any
	unsigned long currentBufferColumn = 1L; //the user will know the exact line of an error (or warning) if any
	vector<char> temp; //a temporary vector to store the characters of identifiers, numbers, strings and chars

	while (currentBufferCharacter < this->bufferLength)
	{
		switch (currentlyReading)
		{
			case MultiLineComment:
				//if this is the end of the comment
				if ((this->buffer[currentBufferCharacter] == '*') && (this->buffer[currentBufferCharacter + 1] == '/'))
				{
					currentlyReading = Source; //the lexer is going to read source code again
					this->MultiLineFoundComments++; //update the number of multi line comments lexed
					currentBufferCharacter++; //the lexer won't read the / character the next step
				} else if (this->buffer[currentBufferCharacter] == '\n') {
					currentBufferRow++; //update the row number
					currentBufferColumn = 1L; //update the column number
				}
				//else do nothing, I don't care about comments
				break;

			case SingleLineComment:
				//if this is the end of the line
				if (this->buffer[currentBufferCharacter] == '\n')
				{
					currentlyReading = Source; //the lexer is going to read source code again
					this->SingleLineFoundComments++; //update the number of single line comments lexed
					currentBufferRow++; //update the row number
					currentBufferColumn = 1L; //update the column number
				}
				//else do nothing, I don't care about comments
				break;

			case Number:
				//if this is the end of the number
				if (((this->buffer[currentBufferCharacter] < 48) || (this->buffer[currentBufferCharacter] > 57)) && ((this->buffer[currentBufferCharacter] != '.') && (this->buffer[currentBufferCharacter] != 'D') && (this->buffer[currentBufferCharacter] != 'X') && (this->buffer[currentBufferCharacter] != 'B') && (this->buffer[currentBufferCharacter] != 'd') && (this->buffer[currentBufferCharacter] != 'x') && (this->buffer[currentBufferCharacter] != 'b') && (this->buffer[currentBufferCharacter] != 'a')&& (this->buffer[currentBufferCharacter] != 'A') && (this->buffer[currentBufferCharacter] != 'b') && (this->buffer[currentBufferCharacter] != 'C') && (this->buffer[currentBufferCharacter] != 'c') && (this->buffer[currentBufferCharacter] != 'd') && (this->buffer[currentBufferCharacter] != 'D') && (this->buffer[currentBufferCharacter] != 'e') && (this->buffer[currentBufferCharacter] != 'E') && (this->buffer[currentBufferCharacter] != 'f') && (this->buffer[currentBufferCharacter] != 'F')))
				{ //then save the number stored in temp
					size_t characters = temp.size();
					char* numberStringFromTemp = new char[characters + 1];
					numberStringFromTemp[characters] = (char)0x00;
					size_t currentChar;
					for (currentChar = 0L; currentChar < characters; currentChar++)
						numberStringFromTemp[currentChar] = temp[currentChar];
					//create the token
					Token numberToken;
					numberToken.type = TNUMBER;
					numberToken.row = currentBufferRow;
					numberToken.column = currentBufferColumn;
					numberToken.data = numberStringFromTemp;
					this->Tokens->push_back(numberToken); //save the new token
					temp.clear(); //clear the temp buffer after having saved the number stored in
					currentlyReading = Source; //prepare the lexer for the next character
					currentBufferCharacter--; //the lexer have to re-read the current character
				} else { //else save the character to the temp
					temp.push_back(this->buffer[currentBufferCharacter]); //store the read character
				}
				break;

			case String:
				//if this is the end of a string
				if (this->buffer[currentBufferCharacter] == '"')
				{ //then save the string stored in temp
					size_t characters = temp.size();
					char* stringFromTemp = new char[characters + 1];
					stringFromTemp[characters] = (char)0x00;
					size_t currentChar;
					for (currentChar = 0L; currentChar < characters; currentChar++)
						stringFromTemp[currentChar] = temp[currentChar];
					//create the token
					Token stringToken;
					stringToken.type = TSTRING;
					stringToken.row = currentBufferRow;
					stringToken.column = currentBufferColumn;
					stringToken.data = stringFromTemp;
					this->Tokens->push_back(stringToken); //save the new token
					temp.clear(); //clear the temp buffer after having saved the string stored in
					currentlyReading = Source; //prepare the lexer for the next character
				} else if (this->buffer[currentBufferCharacter] == '\n') {
					temp.push_back('\'); //store the read character
					temp.push_back('n'); //as it should be stored
					currentBufferRow++; //update the row number
					currentBufferColumn = 1L; //update the column number

					/* FUCK YOU! STUPID USER! */

				} else if (this->buffer[currentBufferCharacter] == '\t') {
					temp.push_back('\'); //store the read character
					temp.push_back('t'); //as it should be stored

					/* FUCK YOU! STUPID USER! */

				} else if (this->buffer[currentBufferCharacter] == '\b') {
					temp.push_back('\'); //store the read character
					temp.push_back('b'); //as it should be stored

					/* FUCK YOU! STUPID USER! */

				} else if (this->buffer[currentBufferCharacter] == '\f') {
					temp.push_back('\'); //store the read character
					temp.push_back('f'); //as it should be stored

					/* FUCK YOU! STUPID USER! */

				} else if (this->buffer[currentBufferCharacter] == '\v') {
					temp.push_back('\'); //store the read character
					temp.push_back('v'); //as it should be stored

					/* FUCK YOU! STUPID USER! */

				} else if (this->buffer[currentBufferCharacter] == '\r') {
					temp.push_back('\'); //store the read character
					temp.push_back('r'); //as it should be stored

					/* FUCK YOU! STUPID USER! */

				} else if (this->buffer[currentBufferCharacter] == '\?') {
					temp.push_back('\'); //store the read character
					temp.push_back('?'); //as it should be stored

					/* FUCK YOU! STUPID USER! */

				} else { //else save the character to the temp
					temp.push_back(this->buffer[currentBufferCharacter]); //store the read character
				}
				break;

			case Identifier:
				//if this is a character that is part of an identifier
				if (((this->buffer[currentBufferCharacter] >= 65) && (this->buffer[currentBufferCharacter] <= 90)) || ((this->buffer[currentBufferCharacter] >= 97) && (this->buffer[currentBufferCharacter] <= 122)) || (this->buffer[currentBufferCharacter] == '_'))
				{
					temp.push_back(this->buffer[currentBufferCharacter]); //store the read character
				} else { //save the string stored in temp
					size_t characters = temp.size();
					char* identifierStringFromTemp = new char[characters + 1];
					identifierStringFromTemp[characters] = (char)0x00;
					size_t currentChar;
					for (currentChar = 0L; currentChar < characters; currentChar++)
						identifierStringFromTemp[currentChar] = temp[currentChar];
					//create the token
					Token identifierToken;
					identifierToken.type = TIDENTIFIER;
					identifierToken.row = currentBufferRow;
					identifierToken.column = currentBufferColumn;
					identifierToken.data = identifierStringFromTemp;
					this->Tokens->push_back(identifierToken); //save the new token
					temp.clear(); //clear the temp buffer after having saved the identifier stored in
					currentlyReading = Source; //prepare the lexer for the next character
					currentBufferCharacter--; //the lexer have to re-read the current character
				}
				break;

			case Source:
				if (this->buffer[currentBufferCharacter] == '\n')
				{
					currentBufferRow++; //update the row number
					currentBufferColumn = 1L; //update the column number
				}
				else if (this->buffer[currentBufferCharacter] == '#') //single line comments aren't C-like
				{
					currentlyReading = SingleLineComment; //the lexer is going to read a single line comment
				}
				else if ((this->buffer[currentBufferCharacter] == '/') && (this->buffer[currentBufferCharacter + 1] == '*')) //multi line comments are C-like
				{
					currentlyReading = MultiLineComment; //the lexer is going to read a multi line comment
					currentBufferCharacter++; //the lexer won't read the * simbol
				}
				else if (this->buffer[currentBufferCharacter] == '"') //start of a string
				{
					currentlyReading = String; //the lexer is going to read a string
				}
				else if ((this->buffer[currentBufferCharacter] >= 48) && (this->buffer[currentBufferCharacter] <= 57)) // ASCII code of 0 is 48 and of 9 is 57
				{
					currentlyReading = Number; //the lexer is going to read a number
					currentBufferCharacter--; //a little trick: i want the lexer to read this character again (when the lexer will expect a number)
				}
				else if (((this->buffer[currentBufferCharacter] >= 65) && (this->buffer[currentBufferCharacter] <= 90)) || ((this->buffer[currentBufferCharacter] >= 97) && (this->buffer[currentBufferCharacter] <= 122)) || (this->buffer[currentBufferCharacter] == '_'))
				{
					currentlyReading = Identifier; //the lexer is going to read an identifier
					currentBufferCharacter--; //the same little trick: i want the lexer to read this character again (when the lexer will expect an identifier)
				}
				else if (this->buffer[currentBufferCharacter] == ';')
				{
					Token newToken;
					newToken.type = TDOTCOMMA;
					newToken.row = currentBufferRow;
					newToken.column = currentBufferColumn;
					newToken.data = (char*)NULL;
					this->Tokens->push_back(newToken);
				}
				else if (this->buffer[currentBufferCharacter] == '.')
				{
					Token newToken;
					newToken.type = TDOT;
					newToken.row = currentBufferRow;
					newToken.column = currentBufferColumn;
					newToken.data = (char*)NULL;
					this->Tokens->push_back(newToken);
				}
				else if (this->buffer[currentBufferCharacter] == ',')
				{
					Token newToken;
					newToken.type = TCOMMA;
					newToken.row = currentBufferRow;
					newToken.column = currentBufferColumn;
					newToken.data = (char*)NULL;
					this->Tokens->push_back(newToken);
				}
				else if (this->buffer[currentBufferCharacter] == '{')
				{
					Token newToken;
					newToken.type = TLBRACE;
					newToken.row = currentBufferRow;
					newToken.column = currentBufferColumn;
					newToken.data = (char*)NULL;
					this->Tokens->push_back(newToken);
				}
				else if (this->buffer[currentBufferCharacter] == '}')
				{
					Token newToken;
					newToken.type = TRBRACE;
					newToken.row = currentBufferRow;
					newToken.column = currentBufferColumn;
					newToken.data = (char*)NULL;
					this->Tokens->push_back(newToken);
				}
				else if (this->buffer[currentBufferCharacter] == '(')
				{
					Token newToken;
					newToken.type = TLPAREN;
					newToken.row = currentBufferRow;
					newToken.column = currentBufferColumn;
					newToken.data = (char*)NULL;
					this->Tokens->push_back(newToken);
				}
				else if (this->buffer[currentBufferCharacter] == ')')
				{
					Token newToken;
					newToken.type = TRPAREN;
					newToken.row = currentBufferRow;
					newToken.column = currentBufferColumn;
					newToken.data = (char*)NULL;
					this->Tokens->push_back(newToken);
				}
				else if (this->buffer[currentBufferCharacter] == '[')
				{
					Token newToken;
					newToken.type = TLSQRPAREN;
					newToken.row = currentBufferRow;
					newToken.column = currentBufferColumn;
					newToken.data = (char*)NULL;
					this->Tokens->push_back(newToken);
				}
				else if (this->buffer[currentBufferCharacter] == ']')
				{
					Token newToken;
					newToken.type = TRSQRPAREN;
					newToken.row = currentBufferRow;
					newToken.column = currentBufferColumn;
					newToken.data = (char*)NULL;
					this->Tokens->push_back(newToken);
				}
				break;

			default:
				break;
		}
		currentBufferColumn++; //update the number of the character in the current line
		currentBufferCharacter++; //next time I'll read the next character
	}
	//check if the lexer reached the end unexpectly
	if ((currentlyReading != Source) && (currentlyReading != SingleLineComment))
	{
		if (currentlyReading == Identifier)
		{
			size_t characters = temp.size();
			char* identifierStringFromTemp = new char[characters + 1];
			identifierStringFromTemp[characters] = (char)0x00;
			size_t currentChar;
			for (currentChar = 0L; currentChar < characters; currentChar++)
				identifierStringFromTemp[currentChar] = temp[currentChar];
			//create the token
			Token identifierToken;
			identifierToken.type = TIDENTIFIER;
			identifierToken.row = currentBufferRow;
			identifierToken.column = currentBufferColumn;
			identifierToken.data = identifierStringFromTemp;
			this->Tokens->push_back(identifierToken); //save the new token

			/* FUCK YOU! STUPID USER! */

		} else if (currentlyReading == Number) {
			size_t characters = temp.size();
			char* numberStringFromTemp = new char [characters + 1];
			numberStringFromTemp[characters] = (char)0x00;
			size_t currentChar;
			for (currentChar = 0L; currentChar < characters; currentChar++)
				numberStringFromTemp[currentChar] = temp[currentChar];
			//create the token
			Token numberToken;
			numberToken.type = TNUMBER;
			numberToken.row = currentBufferRow;
			numberToken.column = currentBufferColumn;
			numberToken.data = numberStringFromTemp;
			this->Tokens->push_back(numberToken); //save the new token

			/* FUCK YOU! STUPID USER! */

		} else if (currentlyReading == String) {
			//return an error
		} else { //MultiLineComment
			//return an error
		}
	}
	//the lexer's job is done
	return NO_ERRORS;
}

#include <stdio.h>
#include <string.h>
#include <vector>

#include "Lexer.h"

int main(int argc, char** argv)
{
	char* src = "/* sd */ \"This should be displayed \"...# /*s5d*/\n/*cd \"This should not be displayed\"*/#f\n55 6.5H\n5.9B /* 5.7 */ 5f, 5,f\n \"a slash and a n should be displayed: \n\" \"'\"  #comment\n\n ";
	vector<Token> TokensList;
	Lexer Analyzer(src, &TokensList);
	if (Analyzer.Analyze() != 0)
	{
		printf("Si e' verificato un errore!");
	} else {
		printf("Commenti multilinea: %u\nCommenti: %u\n", Analyzer.MultiLineFoundComments, Analyzer.SingleLineFoundComments );
	}

	size_t i;
	for (i = 0; i < TokensList.size(); i++)
	{
		printf("\n\nToken numer %u: \nType: ", (i + 1));
		if (TokensList[i].type == TIDENTIFIER)
		{
			printf("identifier\nValue: ");
			puts(TokensList[i].data);
		}
		else if (TokensList[i].type == TNUMBER)
		{
			printf("number\nValue: ");
			puts(TokensList[i].data);
		}
		else if (TokensList[i].type == TCHAR)
		{
			printf("character\nValue: ");
			puts(TokensList[i].data);
		}
		else if (TokensList[i].type == TSTRING)
		{
			printf("string\nValue: ");
			puts(TokensList[i].data);
		}
	}
	return 0;
}

Altra domanda: credete che dovrei usare shared_ptr invece di passare il puntatore? Che benefici avrei nel farlo?

P.S. Non ho capito perché abbia funzionato durante varie prove, ma il problema è legato al fatto che nella classe ci sia vector<Token>* anziché vector<Token>. A dire la verità mi scoccia parecchio il fatto che, anche se ho ricompilato e provato più volte, il problema sia sorto in ritardo, e ancora di più avere nella classe Lexer un vector<Token>. C'è un'alternativa più "elegante"?

Ultima modifica effettuata da TheDarkJuster 21/08/14 21:01

aaa

22/08/14 22:48

pierotofy

Mm, l'unica cosa che è certamente sbagliata è il modo in cui popoli il vector Tokens.

Una variabile inizializzata così:

{ // block
   Tipo a;
   // ...
} // end of block

Verrà distrutta (ne viene invocato il distruttore) all'uscita del suo scope.

#include <iostream>
#include <vector>

using namespace std;

// Tracing type: reports its construction and its destruction on standard output.
// No copy constructor is declared, so copies are created silently; only their
// destruction is reported.
class Test{
public:
    int n; // number echoed in the trace messages

    // Stores the number and prints "Creato <n>".
    Test(int id) : n(id){
        std::cout << "Creato " << n << std::endl;
    }

    // Prints "Distrutto <n>" when the object's lifetime ends.
    ~Test() {
        std::cout << "Distrutto " << n << std::endl;
    }
};

int main(){
    vector<Test> *p = new vector<Test>();

    {
      Test a(1);
      Test b(2);
      p->push_back(a);
      p->push_back(b);
    }
    
    cout << "Fine" << endl;
}

Ultima modifica effettuata da pierotofy 22/08/14 22:48

Il mio blog: piero.dev

23/08/14 10:09

TheDarkJuster

Giusto, non ci avevo pensato...
A questo punto mi conviene creare nuovi token e memorizzare nel vettore il puntatore.

aaa