# pike.git/src/builtin.cmod

Branch: Tag:

## 2014-12-04

#### 2014-12-04 19:23:23 by Markus Ottensmann <markuso@opera.com>

• 06dcae14b343782bde53419a8cc9fcd36457417b (40 lines) (+40/-0) [ Show | Annotate ]
Branch: bill/master_archive_support
Add function String.levenshtein_distance()

This function calculates the Levenshtein distance between two strings.
The Levenshtein distance is the minimum number of single-character edit
operations (insert, delete or substitute) needed to get from one string to the other.

The algorithm can be used in approximate string matching to find matches for
a short string in many longer texts, when a small number of differences is
expected.

3945:   /*! @endclass    */    + PMOD_EXPORT + PIKEFUN int levenshtein_distance(string a, string b) + { +  int i, j, n, *lev_i, *lev_p; +  +  /* Simple cases: strings are equal or one of them is empty: */ +  if (a == b) RETURN 0; +  if (a->len == 0) RETURN b->len; +  if (b->len == 0) RETURN a->len; +  +  /* Return -1 if any of the strings is wider than 8 bits: */ +  if (a->size_shift || b->size_shift) RETURN -1; +  +  /* Allocate two rows on the stack: */ +  n = b->len+1; +  lev_i = alloca(n*sizeof(int)); +  lev_p = alloca(n*sizeof(int)); +  if (!lev_i || !lev_p) RETURN -1; +  +  /* Initialise the first row */ +  for (j = 0; j < n; j++) lev_i[j] = j; +  +  for (i = 0; i < a->len; i++) +  { +  /* lev_p = row for i, lev_i = row for i+1: */ +  memcpy(lev_p, lev_i, n*sizeof(int)); +  lev_i[0] = i + 1; +  for (j = 0; j < b->len; j++) +  { +  int cost = (a->str[i] == b->str[j]) ? 0 : 1; +  int test, min_val = lev_i[j]+1; +  if ((test = lev_p[j+1]+1) < min_val) min_val = test; +  if ((test = lev_p[j]+cost) < min_val) min_val = test; +  lev_i[j+1] = min_val; +  } +  } +  RETURN lev_i[b->len]; + } +    /*! @endmodule    */