Mega Code Archive

 
Categories / C / Small Application
 

Extract character n-grams from text data

/*
 * chargram - extract character n-grams from text read on standard input.
 *
 * Usage: chargram INT
 *
 * Every input line is split into tokens on punctuation, whitespace and
 * digits.  Each token is padded with one leading "_" and (INT - 1) trailing
 * "_" characters, and every INT-character window of the padded token is
 * written to standard output on its own line.
 */
#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAXLINE 1024 /* size of the line buffer handed to fgets() */
#define MINLEN 3     /* lines shorter than this (newline included) are skipped */

/* print all ngrams for `str' */
void printgrams(char *, int);

/* pad token with one prefix and (N - 1) suffix copies of the pad string */
char *mkpadgr(char *, char *, int);

/*
 * Define CHARGRAM_NO_MAIN to leave out the program entry point so that
 * printgrams() and mkpadgr() can be linked into a test harness that supplies
 * its own main().  The default build is unaffected.
 */
#ifndef CHARGRAM_NO_MAIN
/*
 * Entry point: parse the n-gram length from argv[1], then tokenise stdin
 * line by line and print the n-grams of every token.
 *
 * Returns 0 on success, 1 when the argument is missing or is not a positive
 * integer that fits in an int.
 */
int main(int argc, char *argv[])
{
    const char *delim = ".,:;`'\"+-_(){}[]<>*&^%$#@!?~/|\\= \t\r\n1234567890";
    char line[MAXLINE];
    char *token = NULL;
    char *end = NULL;
    long parsed = 0;
    int nglen = 0;

    if (argc != 2) {
        fprintf(stderr, "Usage: chargram INT\n");
        return 1;
    }

    /*
     * strtol() instead of atoi(): atoi() silently yields 0 for garbage, and
     * a length below 1 cannot produce a meaningful n-gram.
     */
    errno = 0;
    parsed = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || errno == ERANGE ||
        parsed < 1 || parsed > INT_MAX) {
        fprintf(stderr, "chargram: INT must be a positive integer\n");
        fprintf(stderr, "Usage: chargram INT\n");
        return 1;
    }
    nglen = (int)parsed;

    /*
     * NOTE: a line longer than MAXLINE - 1 characters is delivered by
     * fgets() in several pieces, so a token straddling a piece boundary is
     * processed as two tokens.
     */
    while (fgets(line, MAXLINE, stdin) != NULL) {
        if (strlen(line) < MINLEN) {
            continue;
        }
        for (token = strtok(line, delim); token != NULL;
             token = strtok(NULL, delim)) {
            printgrams(token, nglen);
        }
    }

    return 0;
}
#endif /* CHARGRAM_NO_MAIN */

/*
 * Print every N-character window of the padded form of `str' to stdout,
 * one window per line.  A token of length L yields L + 1 windows.
 *
 * Does nothing when `str' is NULL or N is smaller than 1.  Terminates the
 * program with EXIT_FAILURE if the padded copy cannot be allocated.
 */
void printgrams(char *str, int N)
{
    char *padded = NULL;
    size_t len = 0;
    size_t i = 0;

    if (str == NULL || N < 1) {
        return;
    }

    padded = mkpadgr(str, "_", N);
    if (padded == NULL) {
        fprintf(stderr, "chargram: out of memory\n");
        exit(EXIT_FAILURE);
    }

    /* The padded string holds len + N characters, so the last window,
     * starting at index len, still lies entirely inside the buffer. */
    len = strlen(str);
    for (i = 0; i <= len; i++) {
        fwrite(padded + i, sizeof(char), (size_t)N, stdout);
        putchar('\n');
    }

    free(padded);
}

/*
 * Build a newly allocated string consisting of `padd', then `str', then
 * (N - 1) further copies of `padd'.  For N <= 1 no trailing copies are
 * added.  `padd' may be of any length.
 *
 * Returns NULL when an argument is NULL, when the required size would
 * overflow size_t, or when allocation fails.  The caller owns the result
 * and must free() it.
 */
char *mkpadgr(char *str, char *padd, int N)
{
    char *buff = NULL;
    char *out = NULL;
    size_t len = 0;
    size_t padlen = 0;
    size_t copies = 0; /* total pad copies: one prefix + the suffixes */
    size_t k = 0;

    if (str == NULL || padd == NULL) {
        return NULL;
    }

    len = strlen(str);
    padlen = strlen(padd);
    copies = (N > 1) ? (size_t)N : 1;

    /* Refuse sizes that would wrap around size_t. */
    if (padlen != 0 && copies > (SIZE_MAX - len - 1) / padlen) {
        return NULL;
    }

    buff = malloc(len + copies * padlen + 1);
    if (buff == NULL) {
        return NULL;
    }

    /* Append with a moving cursor rather than repeated strcat() rescans. */
    out = buff;
    memcpy(out, padd, padlen);
    out += padlen;
    memcpy(out, str, len);
    out += len;
    for (k = 1; k < copies; k++) {
        memcpy(out, padd, padlen);
        out += padlen;
    }
    *out = '\0';

    return buff;
}