C/C++ - Problema di allocazione in un parser

18/09/10 18:04
Ciao a tutti, scusate il titolo poco descrittivo ma non sapevo che scrivere: il mio problema è effettivamente riguardante l'allocazione e il salvataggio di dati in un semplice parser che ho scritto. Almeno credo.

Il programma legge del testo in STDIN, lo divide in token di due tipi (parole o caratteri non alfabetici) che vengono salvati in un array di token.
Non capisco dove sbaglio, eppure quando il programma prova a stampare i token salvati, alla fine del processo, stampa qualcosa di strano e si capisce che qualcosa è andato storto...
Scusate i pochi commenti, spero che il codice sia leggibile ugualmente.

Aspetto l'eroe che mi darà qualche suggerimento... grazie! :k:
Oltre a qui sotto, il codice lo trovate a sprunge.us/….
/*
*  This program reads an arbitrary text from STDIN and splits it into tokens.
*  A token is either a word or a single non-alphabetic character.
*
*  When it finds a 0xFF byte, it treats the next byte as the number of
*  repetitions of the byte that follows it:
*  "Hi everyb" 0xFF 0x09 "ody"  --->  "Hi everybooooooooody"
*/

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>

/* token categories passed to got_token() */
#define WORD    1
#define PUNCT   2

/* function declarations */
void parse(int c);
void got_token(int type);
void add_to_token(int c);
void close_token(void);

/* number of words read from the input text */
unsigned int words = 0;
/* number of single-character non-alphabetic tokens (spaces included) */
unsigned int puncts = 0;
/* index into array_token of the token currently being built */
unsigned int token_num = 0;
/* index of the last element written in the current token (temporary) */
unsigned int length = 0;

/* growable table of tokens; each entry points to an array of ints that
 * holds one character per element */
int **array_token;



/*
 * Entry point: reads STDIN byte by byte, expands 0xFF <count> <char>
 * sequences, feeds every resulting character to parse(), then prints the
 * statistics and the stored tokens and releases all memory.
 *
 * Returns 0 on success, EXIT_FAILURE if the token table cannot be allocated.
 */
int main(void) {
    int c, len;
    unsigned int i;

    /* calloc so that slot 0 starts out NULL: add_to_token() compares the
     * slot against NULL to decide whether a token buffer already exists */
    array_token = calloc(1, sizeof *array_token);
    if (array_token == NULL) {
        perror("calloc");
        return EXIT_FAILURE;
    }

    /* decompress/read and store STDIN in tokens */
    while ((c = getchar()) != EOF) {
        if (c != 0xFF) {
            parse(c);
            continue;
        }

        /* 0xFF <count> <char>: a sequence cut short by EOF is dropped,
         * otherwise an EOF count (-1) would make the loop run almost forever */
        len = getchar();
        if (len == EOF)
            break;
        c = getchar();
        if (c == EOF)
            break;
        while (len-- > 0)
            parse(c);
    }
    parse(EOF);     /* let the parser finish the token in progress */

    /* the counters are unsigned, so they must be printed with %u */
    printf("Words:\t\t%u\n"             /* total amount of words read */
            "Puncts:\t\t%u\n"           /* total amount of punctuation chars */
            "Tokens:\t\t%u\n"           /* number of tokens */
            "Arrays used:\t%u\n",       /* real number of arrays used */
            words,
            puncts,
            words + puncts,
            token_num + 1);

    /* print out every token, from the last one stored down to the first */
    puts("\nThese are the tokens:\n");
    i = token_num;
    do {
        const int *tok = array_token[i];

        /* A token holds one character per int, so it cannot be handed to
         * "%s" as a char string: walk the elements up to the zero
         * terminator that close_token() is meant to append. */
        if (tok != NULL) {
            while (*tok != '\0')
                putchar(*tok++);
        }
        putchar('\n');
    } while (i-- > 0);

    /* free used memory, using a local index so token_num is left intact */
    i = token_num;
    do {
        free(array_token[i]);
    } while (i-- > 0);
    free(array_token);
    array_token = NULL;

    return 0;
}

/*
 * Feeds one input character to the tokenizer state machine.
 *
 * c is either EOF or a character value as returned by getchar().
 * A run of alphabetic characters forms one WORD token; every other
 * character forms a PUNCT token of its own. Passing EOF finishes the
 * token in progress.
 */
void parse(int c) {
    static enum {
        START, IN_WORD
    } state;    /* static storage: starts as START (zero) */

    if (c == EOF) {
        if (state == IN_WORD) {
            /* a word was still open: count it and terminate it */
            words++;
            close_token();
        } else if (token_num > 0) {
            /* In START state the slot at token_num was opened by the last
             * got_token() and never written to, so there is nothing to
             * free or terminate: just step back to the last real token. */
            token_num--;
        }
        return;
    }

    switch (state) {
    case IN_WORD:
        if (isalpha(c)) {
            add_to_token(c);
            return;
        }
        /* the word ended; c itself is handled by the START case below */
        got_token(WORD);
        state = START;
        /* fall through */

    case START:
        add_to_token(c);
        if (isalpha(c))
            state = IN_WORD;
        else
            got_token(PUNCT);
        break;
    }
}

/*
 * Completes the current token: updates the counter selected by type
 * (WORD or PUNCT), terminates the token with close_token() and opens a
 * new, empty slot in array_token for the next token.
 *
 * Exits with EXIT_FAILURE if the table cannot be grown.
 */
void got_token(int type) {
    int **grown;

    switch (type) {
        case WORD:
            words++;
            break;
        case PUNCT:
            puncts++;
            break;
        default:
            break;
    }

    close_token();

    /* room for slots 0 .. token_num + 1; go through a temporary so the
     * table is not lost if realloc fails */
    grown = realloc(array_token, ((size_t)token_num + 2) * sizeof *grown);
    if (grown == NULL) {
        perror("realloc");
        exit(EXIT_FAILURE);
    }
    array_token = grown;

    /* realloc does not initialise the added slot, and add_to_token()
     * relies on an empty slot being NULL */
    array_token[++token_num] = NULL;        /* new token */
}

/*
 * Appends character c to the token at array_token[token_num], creating
 * the token buffer if the slot is still NULL. On return, length is the
 * index at which c was stored.
 *
 * Exits with EXIT_FAILURE if memory cannot be allocated.
 */
void add_to_token(int c) {
    int *buf;

    /* We should have already initialized array_token. Let's be safe:
     * allocate zeroed slots up to and including token_num. */
    if (array_token == NULL) {
        array_token = calloc((size_t)token_num + 1, sizeof *array_token);
        if (array_token == NULL) {
            perror("calloc");
            exit(EXIT_FAILURE);
        }
    }

    /* a NULL slot means a fresh token, otherwise the token grows by one */
    if (array_token[token_num] == NULL)
        length = 0;
    else
        length++;

    /* realloc(NULL, n) behaves like malloc(n), so one call covers both
     * cases; the temporary keeps the old buffer reachable on failure */
    buf = realloc(array_token[token_num], ((size_t)length + 1) * sizeof *buf);
    if (buf == NULL) {
        perror("realloc");
        exit(EXIT_FAILURE);
    }
    buf[length] = c;
    array_token[token_num] = buf;
}

/* this is an optional function... */
void close_token() {
    /* realloc the array, expanding its size, in order to add 'Ciao a tutti, scusate il titolo poco descrittivo ma non sapevo che scrivere: il mio problema è effettivamente riguardante l'allocazione e il salvataggio di dati in un semplice parser che ho scritto. Almeno credo.



Il programma legge del testo in STDIN, lo divide in token di due tipi (parole o caratteri non alfabetici) che vengono salvati in un array di token.

Non capisco dove sbaglio, eppure quando il programma prova a stampare i token salvati, alla fine del processo, stampa qualcosa di strano e si capisce che qualcosa è andato storto...

Scusate i pochi commenti, spero che il codice sia leggibile ugualmente.



Aspetto l'eroe che di darà qualche suggerimento... grazie! 



Oltre a qui sotto, il codice lo trovate a sprunge.us/….



/*
*  This program reads from STDIN an arbitrary text which will be tokenized.
*  A token can be either a word or another non-alphabetic character.
*
*  When it finds a '0xFF' char it will parse the seguent char as it was the
*  number of repetitions of the following third char:
*  "Hi everyb0xFF0x390dy"  --->  "Hi everybooooooooody"
*/

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>

#define WORD    1
#define PUNCT   2

/* function declarations */
void parse(int);
void got_token(int);
void add_to_token(int);
void close_token(void);

unsigned int words = 0,         /* amount of words read from the input text */
             puncts = 0,        /* amount of punctuation characters, including spaces, etc. */
             token_num = 0,     /* will be the sum of words + puncts */
             length = 0;        /* the length of a token in chars (temporary var) */

/* pointer to an area of memory containing pointers to tokens */
int **array_token;



int main(void) {
    int c, len;
    array_token = malloc(sizeof(int *));

    /* decompress/read and store STDIN in tokens */
    while (1) {
        c = getchar();
        if (c == EOF) {
            parse(c);
            break;
        }
        if (c == 0xFF) {
            len = getchar();
            c = getchar();
            while (len--)
                parse(c);
        } else
            parse(c);
    }

    /* eventually, do something with the data... */
    printf("Words:\t\t%d\n"             /* total amount of words read */
            "Puncts:\t\t%d\n"           /* total amount of punctuation chars */
            "Tokens:\t\t%d\n"           /* number of tokens */
            "Arrays used:\t%d\n",       /* real number of arrays used */
            words,
            puncts,
            words + puncts,
            token_num + 1);

    /* DOESN't WORK: print out every token */
    c = token_num;
    puts("\nThese are the tokens:\n");
    do {
        printf("%s\n", (char *)array_token[c]);
    } while (c--);

    /* free used memory!! */
    do {
        free(array_token[token_num]);
    } while (token_num--);
    free(array_token);

    return 0;
}

void parse(int c) {
    static enum {
        START, IN_WORD
    } state;

    if (c == EOF) {
        if (state == IN_WORD)
            words++;
        else {
            free(array_token[token_num]);
            if (token_num > 0)
                token_num--;
        }
        close_token();
        return;
    }

    switch (state) {
    case IN_WORD:
        if (isalpha(c)) {
            add_to_token(c);
            return;
        }
        got_token(WORD);
        state = START;
        /* fall through */

    case START:
        add_to_token(c);
        if (isalpha(c))
            state = IN_WORD;
        else
            got_token(PUNCT);
        break;
    }
}

void got_token(int type) {
    switch (type) {
        case WORD:
            words++;
            break;
        case PUNCT:
            puncts++;
            break;
    }

    close_token();
    array_token = realloc(array_token, (++token_num + 1) * sizeof(int *));      /* new token */
}

void add_to_token(int c) {
    /* We should have already inizialized array_token. Let's be safe... */
    if (array_token == NULL)
        array_token = malloc(sizeof(int *));

    /* inizialize the array to store the token if non-existent */
    if (array_token[token_num] == NULL) {
        length = 0;
        array_token[token_num] = malloc(sizeof(int));
        array_token[token_num][length] = c;
    }
    /* otherwise, expand its size in memory */
    else {
        array_token[token_num] = realloc(array_token[token_num], (++length + 1) * sizeof(int));
        array_token[token_num][length] = c;
    }
}

/* this is an optional function... */
void close_token() {
    /* realloc the array, expanding its size, in order to add '{parsed_message}' at the end */
    array_token[token_num] = realloc(array_token[token_num], (++length + 1) * sizeof(int));
    array_token[token_num][length] = '{parsed_message}';
}' at the end */
    array_token[token_num] = realloc(array_token[token_num], (++length + 1) * sizeof(int));
    array_token[token_num][length] = 'Ciao a tutti, scusate il titolo poco descrittivo ma non sapevo che scrivere: il mio problema è effettivamente riguardante l'allocazione e il salvataggio di dati in un semplice parser che ho scritto. Almeno credo.



Il programma legge del testo in STDIN, lo divide in token di due tipi (parole o caratteri non alfabetici) che vengono salvati in un array di token.

Non capisco dove sbaglio, eppure quando il programma prova a stampare i token salvati, alla fine del processo, stampa qualcosa di strano e si capisce che qualcosa è andato storto...

Scusate i pochi commenti, spero che il codice sia leggibile ugualmente.



Aspetto l'eroe che di darà qualche suggerimento... grazie! 



Oltre a qui sotto, il codice lo trovate a sprunge.us/….



/*
*  This program reads from STDIN an arbitrary text which will be tokenized.
*  A token can be either a word or another non-alphabetic character.
*
*  When it finds a '0xFF' char it will parse the seguent char as it was the
*  number of repetitions of the following third char:
*  "Hi everyb0xFF0x390dy"  --->  "Hi everybooooooooody"
*/

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>

#define WORD    1
#define PUNCT   2

/* function declarations */
void parse(int);
void got_token(int);
void add_to_token(int);
void close_token(void);

unsigned int words = 0,         /* amount of words read from the input text */
             puncts = 0,        /* amount of punctuation characters, including spaces, etc. */
             token_num = 0,     /* will be the sum of words + puncts */
             length = 0;        /* the length of a token in chars (temporary var) */

/* pointer to an area of memory containing pointers to tokens */
int **array_token;



int main(void) {
    int c, len;
    array_token = malloc(sizeof(int *));

    /* decompress/read and store STDIN in tokens */
    while (1) {
        c = getchar();
        if (c == EOF) {
            parse(c);
            break;
        }
        if (c == 0xFF) {
            len = getchar();
            c = getchar();
            while (len--)
                parse(c);
        } else
            parse(c);
    }

    /* eventually, do something with the data... */
    printf("Words:\t\t%d\n"             /* total amount of words read */
            "Puncts:\t\t%d\n"           /* total amount of punctuation chars */
            "Tokens:\t\t%d\n"           /* number of tokens */
            "Arrays used:\t%d\n",       /* real number of arrays used */
            words,
            puncts,
            words + puncts,
            token_num + 1);

    /* DOESN't WORK: print out every token */
    c = token_num;
    puts("\nThese are the tokens:\n");
    do {
        printf("%s\n", (char *)array_token[c]);
    } while (c--);

    /* free used memory!! */
    do {
        free(array_token[token_num]);
    } while (token_num--);
    free(array_token);

    return 0;
}

void parse(int c) {
    static enum {
        START, IN_WORD
    } state;

    if (c == EOF) {
        if (state == IN_WORD)
            words++;
        else {
            free(array_token[token_num]);
            if (token_num > 0)
                token_num--;
        }
        close_token();
        return;
    }

    switch (state) {
    case IN_WORD:
        if (isalpha(c)) {
            add_to_token(c);
            return;
        }
        got_token(WORD);
        state = START;
        /* fall through */

    case START:
        add_to_token(c);
        if (isalpha(c))
            state = IN_WORD;
        else
            got_token(PUNCT);
        break;
    }
}

void got_token(int type) {
    switch (type) {
        case WORD:
            words++;
            break;
        case PUNCT:
            puncts++;
            break;
    }

    close_token();
    array_token = realloc(array_token, (++token_num + 1) * sizeof(int *));      /* new token */
}

void add_to_token(int c) {
    /* We should have already inizialized array_token. Let's be safe... */
    if (array_token == NULL)
        array_token = malloc(sizeof(int *));

    /* inizialize the array to store the token if non-existent */
    if (array_token[token_num] == NULL) {
        length = 0;
        array_token[token_num] = malloc(sizeof(int));
        array_token[token_num][length] = c;
    }
    /* otherwise, expand its size in memory */
    else {
        array_token[token_num] = realloc(array_token[token_num], (++length + 1) * sizeof(int));
        array_token[token_num][length] = c;
    }
}

/* this is an optional function... */
void close_token() {
    /* realloc the array, expanding its size, in order to add '{parsed_message}' at the end */
    array_token[token_num] = realloc(array_token[token_num], (++length + 1) * sizeof(int));
    array_token[token_num][length] = '{parsed_message}';
}';
}
Ultima modifica effettuata da TheWorm 23/09/10 20:16
aaa
28/09/10 22:49
TheWorm
Nessuno eh? Ho provato a riscrivere il codice salvando i token in una linked list.. senza successo.

... ma non c'è proprio anima viva? Nessuno può aiutarmi?
aaa
30/09/10 17:12
TheWorm
Bah, intanto ho abbellito un po' il codice... che continua a non funzionare!
sprunge.us/…
aaa