Mega Code Archive

 
Categories / C / Small Application
 

Show string difference, distance

/* // usage ... command-line: // ~: ./strplot 'to be or not to be' 'to be or not to be' // token based dotplot: // [+][ ][ ][ ][+][ ] // [ ][+][ ][ ][ ][+] // [ ][ ][+][ ][ ][ ] // [ ][ ][ ][+][ ][ ] // [+][ ][ ][ ][+][ ] // [ ][+][ ][ ][ ][+] // // character based distance: // levenshtein distance: 0 // one : to be or not to be // two : to be or not to be */ #include <stdio.h> #include <stdlib.h> #include <string.h> #define MAXTOKENS 256 #define MAXLINE 1024 char **split(char *string, char *delim); int levenshtein(char *stra, char *strb); int main(int argc, char *argv[]) { char *delim = ".,:;`'\"+-_(){}[]<>*&^%$#@!?~/|\\= \t\r\n1234567890"; char *str1 = NULL; char *str2 = NULL; char **stok1 = NULL; char **stok2 = NULL; int i = 0, j = 0; if(argc != 3) return 1; else if(strlen(argv[1]) < 5 || strlen(argv[2]) < 5) return 1; else { str1 = argv[1]; str2 = argv[2]; } stok1 = split(str1, delim); stok2 = split(str2, delim); printf("token based dotplot:\n"); for(i = 0; stok1[i] != NULL; i++) { for(j = 0; stok2[j] != NULL; j++) { if(strcmp(stok1[i], stok2[j]) == 0) printf("[+]"); else printf("[ ]"); } printf("\n"); } printf("\n"); printf("character based distance:\n"); printf("levenshtein distance: %d\n", levenshtein(str1, str2)); printf("one : %s\n", str1); printf("two : %s\n", str2); for(i = 0; stok1[i] != NULL; i++) free(stok1[i]); free(stok1); for(i = 0; stok2[i] != NULL; i++) free(stok2[i]); free(stok2); return 0; } /* calculate levenshtein distance */ int levenshtein(char *stra, char *strb) { int i, j, k, l, m1, m2, m3, laa, r; int lengtha, lengthb, lengthmin; int **d = 0; char *a = 0; i = j = k = l = m1 = m2 = m3 = laa = r = 0; lengtha = lengthb = lengthmin = 0; lengtha = strlen(stra); lengthb = strlen(strb); lengthmin = lengtha; if(lengthb > lengthmin) lengthmin = lengthb; a = (char *)malloc((lengthmin + 2) * sizeof(char)); d = (int **)malloc((lengthmin + 2) * sizeof(int *)); for(i = 0; i < lengthmin + 2; i++) d[i] = (int *)malloc((lengthmin + 2) * sizeof(int)); for(i = 0; i <= lengtha; i++) a[i] = stra[i]; laa = lengtha; d[0][0] = 0; for(i = 1; i <= lengtha; i++) d[i][0] = d[i-1][0]+1; for(j = 1; j <= lengthb; j++) d[0][j] = d[0][j-1]+1; for(i = 1; i <= laa; i++) { for(j = 1;j <= lengthb; j++) { r = 0; if(a[i-1] != strb[j-1]) { r = 1; a[i-1] = strb[j-1]; } m1 = d[i-1][j-1]+r; for(k = lengtha; k >= i-1; k--) a[k+1] = a[k]; a[j-1] = strb[j-1]; lengtha++; m2 = d[i][j-1]+1; for(k = i-1; k < lengtha; k++) a[k] = a[k+1]; lengtha--; m3 = d[i-1][j]+1; r = m1; if( m2 < r) r = m2; if(m3 < r) r = m3; d[i][j] = r; lengtha = laa; for(l = 0; l <= lengtha; l++) a[l] = stra[l]; } } r = d[lengtha][lengthb]; free(a); for(i = 0; i < lengthmin+2; i++) free(d[i]); free(d); return(r); } /* split string into tokens, return token array */ char **split(char *string, char *delim) { char **tokens = NULL; char *working = NULL; char *token = NULL; int idx = 0; tokens = malloc(sizeof(char *) * MAXTOKENS); if(tokens == NULL) return NULL; working = malloc(sizeof(char) * strlen(string) + 1); if(working == NULL) return NULL; strcpy(working, string); for(idx = 0; idx < MAXTOKENS; idx++) tokens[idx] = NULL; token = strtok(working, delim); idx = 0; /* always keep the last entry NULL terminated */ while((idx < (MAXTOKENS - 1)) && (token != NULL)) { tokens[idx] = malloc(sizeof(char) * strlen(token) + 1); if(tokens[idx] != NULL) { strcpy(tokens[idx], token); idx++; token = strtok(NULL, delim); } } free(working); return tokens; }