16/08/14 13:17
TheDarkJuster
Sto scrivendo un lexer per un linguaggi di programmazione simile al c, ma sto avendo dei problemi strani. Vi posto i miei sorgenti:
Lexer.h
Lexer.cpp
e qui arriva il bello.... Lo provo:
E mi accorgo che c'è un problema:
5f non è nella lista dei numeri..... Tuttavia il lexer lo inserisce se dopo 5f aggiungo un altro carattere.
Qualcuno ha una idea sul perchè?
Utilizzare regex o generatori di lexer è una decisione che ho scartato e su cui non intendo tornare.
Lexer.h
#ifndef __LEXER__ #define __LEXER__ #include "Token.h" #include <malloc.h> #include <string.h> #include <vector> using namespace std; #define EMPTY_BUFFER -1 #define NULL_BUFFER -2 #define NO_ERRORS 0 enum LexReading { MultiLineComment, SingleLineComment, Number, String, Identifier, Source }; class Lexer { public: Lexer(char*); short Analyze(void); //lexer data char* buffer; unsigned long bufferLength; //lex result vector<Token> Tokens; vector<char*> Identifiers; vector<char*> Strings; vector<char*> Numbers; vector<char> Characters; //statistics unsigned long MultiLineFoundComments; unsigned long SingleLineFoundComments; }; #endif
Lexer.cpp
#include "Lexer.h" Lexer::Lexer(char* buffer) { this->buffer = buffer; this->bufferLength = strlen(buffer); this->MultiLineFoundComments = 0L; this->SingleLineFoundComments = 0L; } short Lexer::Analyze(void) { if (this->buffer == (char*)NULL) return NULL_BUFFER; else if (this->bufferLength == 0) return EMPTY_BUFFER; //what is the lexer reading? LexReading currentlyReading; unsigned long currentBufferCharacter = 0L; //the character that the leer will read vector<char> temp; //a temporary vector to store the characters of identifiers, numbers, strings and chars while (currentBufferCharacter < (this->bufferLength - 1)) { switch (currentlyReading) { case MultiLineComment: //if this is the end of the comment if ((this->buffer[currentBufferCharacter] == '*') && (this->buffer[currentBufferCharacter + 1] == '/')) { currentlyReading = Source; //the lexer is going to read source code again this->MultiLineFoundComments++; //update the number of multi line comments lexed currentBufferCharacter++; //the lexer won't read the / character the next step } //else do nothing, I don't care about comments break; case SingleLineComment: //if this is the end of the line if (this->buffer[currentBufferCharacter] == '\n') { currentlyReading = Source; //the lexer is going to read source code again this->SingleLineFoundComments++; //update the number of single line comments lexed } //else do nothing, I don't care about comments break; case Number: if (((this->buffer[currentBufferCharacter] < 48) || (this->buffer[currentBufferCharacter] > 57)) && ((this->buffer[currentBufferCharacter] != '.') && (this->buffer[currentBufferCharacter] != 'D') && (this->buffer[currentBufferCharacter] != 'X') && (this->buffer[currentBufferCharacter] != 'B') && (this->buffer[currentBufferCharacter] != 'd') && (this->buffer[currentBufferCharacter] != 'x') && (this->buffer[currentBufferCharacter] != 'b') && (this->buffer[currentBufferCharacter] != 'a')&& (this->buffer[currentBufferCharacter] != 'A') && (this->buffer[currentBufferCharacter] != 'b') && (this->buffer[currentBufferCharacter] != 'C') && (this->buffer[currentBufferCharacter] != 'c') && (this->buffer[currentBufferCharacter] != 'd') && (this->buffer[currentBufferCharacter] != 'D') && (this->buffer[currentBufferCharacter] != 'e') && (this->buffer[currentBufferCharacter] != 'E') && (this->buffer[currentBufferCharacter] != 'f') && (this->buffer[currentBufferCharacter] != 'F'))) { size_t characters = temp.size(); char* numberStringFromTemp = (char*)malloc((unsigned long long int)(sizeof(char) * characters + 1)); numberStringFromTemp[characters - 1] = (char)0x00; numberStringFromTemp[characters] = (char)0x00; size_t currentChar; for (currentChar = 0L; currentChar < characters; currentChar++) numberStringFromTemp[currentChar] = temp[currentChar]; this->Numbers.push_back(numberStringFromTemp); this->Tokens.push_back(TNUMBER); temp.clear(); currentlyReading = Source; } else { temp.push_back(this->buffer[currentBufferCharacter]); //store the read character } break; case Source: //if this is the beginning of a single line comment if (this->buffer[currentBufferCharacter] == '#') //single line comments aren't C-like { currentlyReading = SingleLineComment; //the lexer is going to read a single line comment } else if ((this->buffer[currentBufferCharacter] == '/') && (this->buffer[currentBufferCharacter + 1] == '*')) //multi line comments are C-like { currentlyReading = MultiLineComment; //the lexer is going to read a multi line comment currentBufferCharacter++; //the lexer won't read the * simbol } else if ((this->buffer[currentBufferCharacter] >= 48) && (this->buffer[currentBufferCharacter] <= 57)) // ASCII code of 0 is 48 and of 9 is 57 { currentlyReading = Number; //the lexer is going to read a number currentBufferCharacter--; //a little trick: i want the lexer to read this character again (when the lexer will expect a number) } break; default: break; } currentBufferCharacter++; //next time I'll read the next character } //the lexer's job is done return NO_ERRORS; }
e qui arriva il bello.... Lo provo:
#include <stdio.h> #include <string.h> #include <vector> #include "Lexer.h" int main(int argc, char** argv) { char* src = "/* sd */ # /*s5d*/\n/*cd*/#f\n55 6.5H\n5.9B /* 5.7 */ 5f\n"; Lexer Analyzer(src); if (Analyzer.Analyze() != 0) { printf("Si e' verificato un errore!"); } else { printf("Commenti multilinea: %u\nCommenti: %u\n", Analyzer.MultiLineFoundComments, Analyzer.SingleLineFoundComments ); } size_t i; printf("Numero di numeri: %u\nNumeri: ", Analyzer.Numbers.size()); for (i = 0; i < Analyzer.Numbers.size(); i++) { puts(Analyzer.Numbers[i]); printf(" "); } return 0; }
E mi accorgo che c'è un problema:
5f non è nella lista dei numeri..... Tuttavia il lexer lo inserisce se dopo 5f aggiungo un altro carattere.
Qualcuno ha una idea sul perchè?
Utilizzare regex o generatori di lexer è una decisione che ho scartato e su cui non intendo tornare.
aaa