Mega Code Archive

 
Categories / C / Small Application
 

Filter text with a stop word list

/*
 * tokstop - filter text with a stop word list.
 *
 * Usage: tokstop STOPLIST.txt < input
 *
 * The stop list (one entry per line) is loaded into a binary search tree.
 * Every line read from stdin is split into tokens, and the tokens that are
 * not present in the tree are written to stdout separated by a space; one
 * output line is produced per processed input line.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAXTOKENS 256   /* slots in a token array, NULL terminator included */
#define MAXLINE   1024  /* size of the line buffers handed to fgets() */
#define MINLEN    3     /* input lines shorter than this (newline included) are skipped */
#define STMINLEN  2     /* stop list lines shorter than this (newline included) are skipped */

/* One node of the stop word tree, ordered by strcmp() on `word`. */
struct tnode {
    char *word;                 /* heap-allocated copy of the stop word */
    int count;                  /* how many times the word appeared in the list */
    struct tnode *left, *right; /* smaller / larger words */
};

struct tnode *buildstoptree(char *, struct tnode *);
struct tnode *addtree(struct tnode *, char *);
struct tnode *findstopword(struct tnode *, char *);
struct tnode *talloc(void);
void freetree(struct tnode *);
char **split(char *, char *);

static char *copystring(const char *);
static void freetokens(char **);

int main(int argc, char *argv[])
{
    /* delim does not include \' [\047] quote */
    char *delim = ".,:;`\"+-_(){}[]<>*&^%$#@!?~/|\\= \t\r\n1234567890";
    char **tokens = NULL;
    struct tnode *root = NULL;
    char line[MAXLINE];
    int i = 0;

    if (argc != 2) {
        fprintf(stderr, "Usage: tokstop STOPLIST.txt\n");
        return 1;
    }

    root = buildstoptree(argv[1], root);
    if (root == NULL)
        return 1;

    /*
     * Note: fgets() hands back lines longer than MAXLINE - 1 characters in
     * several pieces, so a word sitting on such a boundary is seen as two
     * tokens.
     */
    while (fgets(line, MAXLINE, stdin) != NULL) {
        if (strlen(line) < MINLEN)
            continue;

        tokens = split(line, delim);
        if (tokens == NULL) {
            fprintf(stderr, "Error - out of memory\n");
            freetree(root);
            return 1;
        }

        for (i = 0; tokens[i] != NULL; i++) {
            if (findstopword(root, tokens[i]) == NULL)
                printf("%s ", tokens[i]);
        }
        printf("\n");

        /* release the strings AND the array that holds them */
        freetokens(tokens);
        tokens = NULL;
    }

    freetree(root);
    return 0;
}

/*
 * Read the stop list `fname` into the binary tree rooted at `p`, one entry
 * per line. Trailing '\n' and '\r' characters are removed, so stop lists
 * with CRLF line endings work as well.
 *
 * Returns the (possibly new) root, or NULL when the file cannot be opened
 * or does not contain a single usable entry. The file is closed on every
 * path.
 */
struct tnode *buildstoptree(char *fname, struct tnode *p)
{
    FILE *fp = NULL;
    char line[MAXLINE];
    size_t len = 0;
    int lcount = 0;

    fp = fopen(fname, "r");
    if (fp == NULL) {
        fprintf(stderr, "Error - fopen(%s)\n", fname);
        return NULL;
    }

    while (fgets(line, MAXLINE, fp) != NULL) {
        len = strlen(line);
        if (len < STMINLEN)
            continue;

        /* strip the line terminator; '\r' could never match a token */
        while (len > 0 && (line[len - 1] == '\n' || line[len - 1] == '\r'))
            line[--len] = '\0';
        if (len == 0)
            continue;

        p = addtree(p, line);
        lcount++;
    }
    fclose(fp);

    if (lcount == 0) {
        fprintf(stderr, "Error - Zero stopwords..\n");
        return NULL;
    }
    return p;
}

/*
 * Split `string` into tokens separated by any character of `delim`.
 *
 * Returns a heap-allocated, NULL-terminated array of heap-allocated
 * strings; the caller owns the array and every string in it. At most
 * MAXTOKENS - 1 tokens are returned, further tokens are dropped.
 * Returns NULL when memory runs out (nothing is leaked in that case).
 *
 * `string` itself is left untouched: strtok() works on a private copy.
 * strtok() keeps static state, so this function is not reentrant.
 */
char **split(char *string, char *delim)
{
    char **tokens = NULL;
    char *working = NULL;
    char *token = NULL;
    int idx = 0;

    tokens = malloc(sizeof *tokens * MAXTOKENS);
    if (tokens == NULL)
        return NULL;
    for (idx = 0; idx < MAXTOKENS; idx++)
        tokens[idx] = NULL;

    /* to make sure, copy string to a safe place */
    working = copystring(string);
    if (working == NULL) {
        free(tokens);
        return NULL;
    }

    idx = 0;
    token = strtok(working, delim);
    /* always keep the last entry NULL terminated */
    while (idx < MAXTOKENS - 1 && token != NULL) {
        tokens[idx] = copystring(token);
        if (tokens[idx] == NULL) {
            /* give up instead of retrying the same allocation forever */
            freetokens(tokens);
            free(working);
            return NULL;
        }
        idx++;
        token = strtok(NULL, delim);
    }

    free(working);
    return tokens;
}

/*
 * Install word `w` in the binary tree rooted at `p`: bump the counter of an
 * existing node or append a new leaf. The tree is walked iteratively.
 *
 * Returns the root of the tree. When memory for a new node cannot be
 * obtained an error is reported on stderr and the tree is returned
 * unchanged (NULL if it was empty).
 */
struct tnode *addtree(struct tnode *p, char *w)
{
    struct tnode **link = &p;   /* the pointer that must receive a new leaf */
    struct tnode *node = NULL;
    int cond = 0;

    while (*link != NULL) {
        cond = strcmp(w, (*link)->word);
        if (cond == 0) {
            (*link)->count++;
            return p;
        }
        link = (cond < 0) ? &(*link)->left : &(*link)->right;
    }

    node = talloc();
    if (node != NULL)
        node->word = copystring(w);
    if (node == NULL || node->word == NULL) {
        fprintf(stderr, "Error - out of memory\n");
        free(node);
        return p;
    }
    node->count = 1;
    node->left = node->right = NULL;

    *link = node;
    return p;
}

/* make new tnode; returns NULL when out of memory, members are uninitialised */
struct tnode *talloc(void)
{
    return malloc(sizeof(struct tnode));
}

/* find value w in binary tree; returns the matching node or NULL */
struct tnode *findstopword(struct tnode *p, char *w)
{
    struct tnode *temp = p;
    int cond = 0;

    while (temp != NULL) {
        cond = strcmp(temp->word, w);
        if (cond == 0)
            return temp;
        temp = (cond > 0) ? temp->left : temp->right;
    }
    return NULL;
}

/* free the whole binary tree: both subtrees, the word and the node itself */
void freetree(struct tnode *p)
{
    if (p == NULL)
        return;

    freetree(p->left);
    freetree(p->right);
    free(p->word);
    free(p);
}

/* ISO C replacement for strdup(): heap copy of `s`, or NULL when out of memory */
static char *copystring(const char *s)
{
    size_t size = strlen(s) + 1;
    char *copy = malloc(size);

    if (copy != NULL)
        memcpy(copy, s, size);
    return copy;
}

/* free a NULL-terminated token array as returned by split(); NULL is accepted */
static void freetokens(char **tokens)
{
    size_t i = 0;

    if (tokens == NULL)
        return;
    for (i = 0; tokens[i] != NULL; i++)
        free(tokens[i]);
    free(tokens);
}