Mega Code Archive

 
Categories / C / Small Application
 

Calculate a similarity score from the intersecting n-grams of two words

/*
 * ngramisect - n-gram similarity of two words.
 *
 * Usage: ngramisect INT WORD1 WORD2
 *
 * Each word is padded with one leading '_' and (INT - 1) trailing '_'
 * characters and cut into overlapping n-grams of INT characters, which
 * yields strlen(word) + 1 n-grams per word.  The program prints both
 * n-gram lists, the number of n-grams the two words share, and the score
 *
 *     shared / (total - shared)
 *
 * i.e. the size of the intersection divided by the size of the union.
 */
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

char **mkcgram(const char *str, int N);
char *mkpadgr(const char *str, const char *padd, int N);

/*
 * Parse TEXT as a strictly positive decimal int.
 * Returns 1 and stores the value in *OUT on success, 0 on any error
 * (empty string, trailing garbage, out of range, value < 1).
 */
static int parse_ngram_len(const char *text, int *out)
{
    char *end = NULL;
    long value = 0;

    errno = 0;
    value = strtol(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE)
        return 0;
    if (value < 1 || value > INT_MAX)
        return 0;

    *out = (int)value;
    return 1;
}

/*
 * Return a freshly allocated, NUL-terminated copy of the first N bytes
 * of SRC, or NULL if allocation fails.  The caller guarantees that SRC
 * holds at least N bytes and owns the returned buffer.
 */
static char *copy_ngram(const char *src, size_t n)
{
    char *copy = malloc(n + 1);

    if (copy == NULL)
        return NULL;

    memcpy(copy, src, n);
    copy[n] = '\0';
    return copy;
}

/* Free the first COUNT strings of GRAMS and the array itself. */
static void free_ngrams(char **grams, size_t count)
{
    size_t i;

    if (grams == NULL)
        return;

    for (i = 0; i < count; i++)
        free(grams[i]);
    free(grams);
}

/*
 * Count the n-grams shared by A (NA entries) and B (NB entries).
 *
 * USED must point to NB zeroed bytes.  Every entry of B is paired with at
 * most one entry of A, so repeated n-grams are not counted several times
 * and the result never exceeds min(NA, NB).
 */
static size_t count_shared(char **a, size_t na, char **b, size_t nb,
                           unsigned char *used)
{
    size_t shared = 0;
    size_t i, j;

    for (i = 0; i < na; i++) {
        for (j = 0; j < nb; j++) {
            if (!used[j] && strcmp(a[i], b[j]) == 0) {
                used[j] = 1;
                shared++;
                break;
            }
        }
    }
    return shared;
}

int main(int argc, char *argv[])
{
    char **cgram1 = NULL;
    char **cgram2 = NULL;
    unsigned char *matched = NULL;
    size_t count1 = 0, count2 = 0;
    size_t total = 0, dupcount = 0, uniq = 0;
    size_t i = 0;
    int nglen = 0;
    int status = 1;

    if (argc != 4) {
        fprintf(stderr, "Usage: ngramisect INT WORD1 WORD2\n");
        return 1;
    }

    if (!parse_ngram_len(argv[1], &nglen)) {
        fprintf(stderr, "ngramisect: INT must be a positive integer\n");
        return 1;
    }

    /* every word yields strlen(word) + 1 n-grams */
    count1 = strlen(argv[2]) + 1;
    count2 = strlen(argv[3]) + 1;

    /* get ngrams for first word */
    cgram1 = mkcgram(argv[2], nglen);
    if (cgram1 == NULL)
        goto nomem;
    for (i = 0; i < count1; i++)
        printf("cgram1[%lu] = %s\n", (unsigned long)i, cgram1[i]);

    printf("---\n");

    /* get ngrams for second word */
    cgram2 = mkcgram(argv[3], nglen);
    if (cgram2 == NULL)
        goto nomem;
    for (i = 0; i < count2; i++)
        printf("cgram2[%lu] = %s\n", (unsigned long)i, cgram2[i]);

    /* compare two arrays, count duplicates */
    matched = calloc(count2, sizeof *matched);
    if (matched == NULL)
        goto nomem;
    dupcount = count_shared(cgram1, count1, cgram2, count2, matched);

    /*
     * calc. score: dupcount <= min(count1, count2), so uniq is at least
     * max(count1, count2) >= 1 and the division is always well defined.
     */
    total = count1 + count2;
    uniq = total - dupcount;

    printf("---\n");
    printf("total ngrams : %lu\n", (unsigned long)total);
    printf("duplicates : %lu\n", (unsigned long)dupcount);
    printf("uniq ngrams : %lu\n", (unsigned long)uniq);
    printf("score : %0.2f\n", (double)dupcount / (double)uniq);

    status = 0;
    goto cleanup;

nomem:
    fprintf(stderr, "ngramisect: out of memory\n");

cleanup:
    free(matched);
    free_ngrams(cgram2, count2);
    free_ngrams(cgram1, count1);
    return status;
}

/*
 * Return the array of n-grams of STR.
 *
 * STR is padded with '_' (see mkpadgr) and split into strlen(STR) + 1
 * overlapping substrings of N characters each.  The returned array holds
 * those strings followed by a terminating NULL pointer; the caller owns
 * the array and every string in it.
 *
 * Returns NULL if STR is NULL, N < 1, or memory cannot be allocated.
 */
char **mkcgram(const char *str, int N)
{
    char **grams = NULL;
    char *padded = NULL;
    size_t count = 0;
    size_t i = 0;

    if (str == NULL || N < 1)
        return NULL;

    count = strlen(str) + 1;
    if (count > (size_t)-1 / sizeof *grams - 1)
        return NULL;

    padded = mkpadgr(str, "_", N);
    if (padded == NULL)
        return NULL;

    grams = malloc((count + 1) * sizeof *grams);
    if (grams == NULL) {
        free(padded);
        return NULL;
    }

    /*
     * padded is strlen(str) + N characters long, so every start offset
     * in [0, strlen(str)] has a full N characters available.
     */
    for (i = 0; i < count; i++) {
        grams[i] = copy_ngram(padded + i, (size_t)N);
        if (grams[i] == NULL) {
            free_ngrams(grams, i);
            free(padded);
            return NULL;
        }
    }
    grams[count] = NULL;

    free(padded);
    return grams;
}

/*
 * Pad a word with one PADD prefix and (N - 1) PADD suffixes.
 *
 * Returns a newly allocated string "PADD STR PADD...PADD" that the caller
 * must free, or NULL if an argument is invalid (NULL pointer, N < 1), the
 * required size would overflow, or memory cannot be allocated.
 */
char *mkpadgr(const char *str, const char *padd, int N)
{
    char *buff = NULL;
    char *out = NULL;
    size_t len = 0, padlen = 0, copies = 0;
    int i = 0;

    if (str == NULL || padd == NULL || N < 1)
        return NULL;

    len = strlen(str);
    padlen = strlen(padd);
    copies = (size_t)N;          /* one prefix + (N - 1) suffixes */

    /* reject sizes where padlen * copies + len + 1 would wrap around */
    if (padlen != 0 && copies > ((size_t)-1 - len - 1) / padlen)
        return NULL;

    buff = malloc(padlen * copies + len + 1);
    if (buff == NULL)
        return NULL;

    out = buff;
    memcpy(out, padd, padlen);
    out += padlen;
    memcpy(out, str, len);
    out += len;
    for (i = 1; i < N; i++) {
        memcpy(out, padd, padlen);
        out += padlen;
    }
    *out = '\0';

    return buff;
}