I come back with another long question. After experimenting with several Python-based Damerau-Levenshtein to edit distances, Finally, I found one that is listed below as editdistance_reference(). This seems to give the correct results and seems to have an effective implementation.
So, I decided to convert the code to Cython. according to my test data, the reference method allows you to deliver results for 11,000 comparisons (for word pairs 12 letters long), while the Keatonization method allows 200,000 comparisons per second. Unfortunately, the results are incorrect: when you look at the variable thisrow
that I print for debugging, my version fills them regardless of what data I throw on it, while another image is displayed on the reference output. For example, testing 'helo'for 'world'
produces the following result ( EDdenotes my function, EDRis a valid working link):
From editdistance():
#ED A [0, 0, 0, 0, 0, 1]
#ED B [1, 0, 0, 0, 0, 1]
#ED B [1, 1, 0, 0, 0, 1]
#ED B [1, 1, 1, 0, 0, 1]
#ED B [1, 1, 1, 1, 0, 1]
#ED B [1, 1, 1, 1, 1, 1]
#ED A [0, 0, 0, 0, 0, 2]
#ED B [1, 0, 0, 0, 0, 2]
#ED B [1, 1, 0, 0, 0, 2]
#ED B [1, 1, 1, 0, 0, 2]
#ED B [1, 1, 1, 1, 0, 2]
#ED B [1, 1, 1, 1, 1, 2]
#ED A [0, 0, 0, 0, 0, 3]
#ED B [1, 0, 0, 0, 0, 3]
#ED B [1, 1, 0, 0, 0, 3]
#ED B [1, 1, 1, 0, 0, 3]
#ED B [1, 1, 1, 1, 0, 3]
#ED B [1, 1, 1, 1, 1, 3]
#ED A [0, 0, 0, 0, 0, 4]
#ED B [1, 0, 0, 0, 0, 4]
#ED B [1, 1, 0, 0, 0, 4]
#ED B [1, 1, 1, 0, 0, 4]
#ED B [1, 1, 1, 1, 0, 4]
#ED B [1, 1, 1, 1, 1, 4]
from editdistance_reference():
#EDR A [0, 0, 0, 0, 0, 1]
#EDR B [1, 0, 0, 0, 0, 1]
#EDR B [1, 2, 0, 0, 0, 1]
#EDR B [1, 2, 3, 0, 0, 1]
#EDR B [1, 2, 3, 4, 0, 1]
#EDR B [1, 2, 3, 4, 5, 1]
#EDR A [0, 0, 0, 0, 0, 2]
#EDR B [2, 0, 0, 0, 0, 2]
#EDR B [2, 2, 0, 0, 0, 2]
#EDR B [2, 2, 3, 0, 0, 2]
#EDR B [2, 2, 3, 4, 0, 2]
#EDR B [2, 2, 3, 4, 5, 2]
#EDR A [0, 0, 0, 0, 0, 3]
#EDR B [3, 0, 0, 0, 0, 3]
#EDR B [3, 3, 0, 0, 0, 3]
#EDR B [3, 3, 3, 0, 0, 3]
#EDR B [3, 3, 3, 3, 0, 3]
#EDR B [3, 3, 3, 3, 4, 3]
#EDR A [0, 0, 0, 0, 0, 4]
#EDR B [4, 0, 0, 0, 0, 4]
#EDR B [4, 4, 0, 0, 0, 4]
#EDR B [4, 4, 4, 0, 0, 4]
#EDR B [4, 4, 4, 4, 0, 4]
#EDR B [4, 4, 4, 4, 4, 4]
I have to be very stupid, as a mistake is probably one of those very obvious things. but I can not find it.
: malloc twoago, oneago thisrow,
. free( twoago ) ..,
line, glibc double free or corruption. googled ; ,
- glibc , ?
setup.py,
(/path/to/python3.1 ./setup.py build_ext --inplace), ,
.
: Python3.1; , *.pyx
unicode, print , .
, , , , ,
. , , editdistance(), ,
, .
setup.py:
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext
setup(
name = 'cython_dameraulevenshtein',
ext_modules = [
Extension( 'cython_dameraulevenshtein', [ 'cython_dameraulevenshtein.pyx', ] ), ],
cmdclass = {
'build_ext': build_ext }, )
cython_dameraulevenshtein.pyx ( , ):
cdef extern from "stdlib.h":
ctypedef unsigned int size_t
void *malloc(size_t size)
void *realloc( void *ptr, size_t size )
void free(void *ptr)
cdef inline unsigned int _minimum_of_two_uints( unsigned int a, unsigned int b ):
if a < b: return a
return b
cdef inline unsigned int _minimum_of_three_uints( unsigned int a, unsigned int b, unsigned int c ):
if a < b:
if c < a:
return c
return a
if c < b:
return c
return b
cdef inline int _warp( unsigned int limit, int value ):
return value if value >= 0 else limit + value
cdef class Array_of_unsigned_int:
cdef unsigned int *data
cdef unsigned int length
def __cinit__( self, unsigned int length, fill_value = None ):
self.length = length
self.data = <unsigned int *>malloc( length * sizeof( unsigned int ) )
if fill_value is not None:
self.fill( fill_value )
cdef fill( self, unsigned int value ):
cdef unsigned int idx
cdef unsigned int *d = self.data
for idx from 0 <= idx < self.length:
d[ idx ] = value
cdef resize( self, unsigned int length ):
self.data = <unsigned int *>realloc( self.data, length * sizeof( unsigned int ) )
self.length = length
def free( self ):
"""Always remember the milk: Free up memory."""
free( self.data )
def as_list( self ):
"""Return the array as a Python list."""
R = []
cdef unsigned int idx
cdef unsigned int *d = self.data
for idx from 0 <= idx < self.length:
R.append( d[ idx ] )
return R
cdef unsigned int _UMX_surrogate_lower_bound = 0x10000
cdef unsigned int _UMX_surrogate_upper_bound = 0x10ffff
cdef unsigned int _UMX_surrogate_hi_lower_bound = 0xd800
cdef unsigned int _UMX_surrogate_hi_upper_bound = 0xdbff
cdef unsigned int _UMX_surrogate_lo_lower_bound = 0xdc00
cdef unsigned int _UMX_surrogate_lo_upper_bound = 0xdfff
cdef unsigned int _UMX_surrogate_foobar_factor = 0x400
cdef Array_of_unsigned_int _cids_from_text( text ):
"""Givn a ``text`` either as a Unicode string or as a ``bytes`` or ``bytearray``, return an instance of
``Array_of_unsigned_int`` that enumerates either the Unicode codepoints of each character or the value of
each byte. Surrogate pairs will be condensed into single values, so on narrow Python builds the length of
the array returned may be less than ``len( text )``."""
is_bytes = isinstance( text, ( bytes, bytearray, ) )
assert is_bytes or isinstance( text, str ), '#121'
cdef unsigned int length = <unsigned int>len( text )
cdef Array_of_unsigned_int R = Array_of_unsigned_int( length )
if length == 0: return R
cdef unsigned int idx = 0
if is_bytes:
for idx from 0 <= idx < length:
R.data[ idx ] = <unsigned int>text[ idx ]
return R
cdef unsigned int cid = 0
cdef bool is_surrogate = False
cdef unsigned int hi = 0
cdef unsigned int lo = 0
cdef unsigned int chr_count = 0
for idx from 0 <= idx < length:
if is_surrogate:
lo = <unsigned int>ord( text[ idx ] )
cid = ( ( hi - _UMX_surrogate_hi_lower_bound ) * _UMX_surrogate_foobar_factor
+ ( lo - _UMX_surrogate_lo_lower_bound ) + _UMX_surrogate_lower_bound )
is_surrogate = False
else:
cid = <unsigned int>ord( text[ idx ] )
if _UMX_surrogate_hi_lower_bound <= cid <= _UMX_surrogate_hi_upper_bound:
hi = cid
is_surrogate = True
continue
R.data[ chr_count ] = cid
chr_count += 1
if chr_count != length:
R.resize( chr_count )
return R
def cids_from_text( text ):
cdef Array_of_unsigned_int c_R =_cids_from_text( text )
R = c_R.as_list()
c_R.free()
return R
cpdef float similarity( char *a, char *b ):
"""Given two byte strings ``a`` and ``b``, return their Damerau-Levenshtein similarity as a float between
0.0 and 1.1. Similarity is computed as ``1 - relative_editdistance( a, b )``, so a result of ``1.0``
indicates identity, while ``0.0`` indicates complete dissimilarity."""
return 1.0 - relative_editdistance( a, b )
cpdef float relative_editdistance( char *a, char *b ):
"""Given two byte strings ``a`` and ``b``, return their relative Damerau-Levenshtein distance. The return
value is a float between 0.0 and 1.0; it is calculated as the absolute edit distance, divided by the
length of the longer string. Therefore, ``0.0`` indicates identity, while ``1.0`` indicates complete
dissimilarity."""
cdef int length = max( len( a ), len( b ) )
if length == 0: return 0.0
return editdistance( a, b ) / <float>length
cpdef unsigned int editdistance( text_a, text_b ):
"""Given texts as Unicode strings or ``bytes`` / ``bytearray`` objects, return their absolute
Damerau-Levenshtein distance. Each deletion, insertion, substitution, and transposition is counted as one
difference, so the edit distance between ``abc`` and ``ab``, ``abcx``, ``abx``, ``acb``, respectively, is
``1``."""
if text_a == text_b: return 0
cdef Array_of_unsigned_int a = _cids_from_text( text_a )
cdef Array_of_unsigned_int b = _cids_from_text( text_b )
R = c_editdistance( a, b )
a.free()
b.free()
return R
cdef unsigned int c_editdistance( Array_of_unsigned_int cids_a, Array_of_unsigned_int cids_b ):
cdef unsigned int a_length = cids_a.length
cdef unsigned int b_length = cids_b.length
if a_length == 0: return b_length
if b_length == 0: return a_length
cdef unsigned int row_length = b_length + 1
cdef unsigned int row_length_1 = row_length - 1
cdef unsigned int row_bytecount = sizeof( unsigned int ) * row_length
cdef unsigned int *oneago = <unsigned int *>malloc( row_bytecount )
cdef unsigned int *twoago = <unsigned int *>malloc( row_bytecount )
cdef unsigned int *thisrow = <unsigned int *>malloc( row_bytecount )
cdef unsigned int idx = 0
cdef unsigned int idx_a = 0
cdef unsigned int idx_b = 0
cdef int idx_a_1_text = 0
cdef int idx_b_1_row = 0
cdef int idx_b_2_row = 0
cdef int idx_b_1_text = 0
cdef unsigned int deletion_cost = 0
cdef unsigned int addition_cost = 0
cdef unsigned int substitution_cost = 0
for idx from 1 <= idx < row_length:
thisrow[ idx - 1 ] = idx
thisrow[ row_length - 1 ] = 0
for idx_a from 0 <= idx_a < a_length:
idx_a_1_text = _warp( a_length, idx_a - 1 )
twoago, oneago = oneago, thisrow
for idx from 0 <= idx < row_length_1:
thisrow[ idx ] = 0
thisrow[ row_length - 1 ] = idx_a + 1
x = []
for idx from 0 <= idx < row_length: x.append( thisrow[ idx ] )
print
print '#ED A', x
for idx_b from 0 <= idx_b < b_length:
idx_b_1_row = _warp( row_length, idx_b - 1 )
idx_b_1_text = _warp( b_length, idx_b - 1 )
assert 0 <= idx_b_1_row < row_length, ( '#323', idx_b_1_row, )
assert 0 <= idx_a_1_text < a_length, ( '#324', idx_a_1_text, )
assert 0 <= idx_b_1_text < b_length, ( '#325', idx_b_1_text, )
deletion_cost = oneago[ idx_b ] + 1
addition_cost = thisrow[ idx_b_1_row ] + 1
substitution_cost = oneago[ idx_b_1_row ] + ( 1 if cids_a.data[ idx_a ]
!= cids_b.data[ idx_b ] else 0 )
thisrow[ idx_b ] = _minimum_of_three_uints( deletion_cost, addition_cost, substitution_cost )
if ( idx_a > 0
and idx_b > 0
and cids_a.data[ idx_a ] == cids_b.data[ idx_b_1_text ]
and cids_a.data[ idx_a_1_text ] == cids_b.data[ idx_b ]
and cids_a.data[ idx_a ] != cids_b.data[ idx_b ] ):
idx_b_2_row = _warp( row_length, idx_b - 2 )
assert 0 <= idx_b_2_row < row_length, ( '#340', idx_b_2_row, )
thisrow[ idx_b ] = _minimum_of_two_uints( thisrow[ idx_b ], twoago[ idx_b_2_row ] + 1 )
x = []
for idx from 0 <= idx < row_length: x.append( thisrow[ idx ] )
print '#ED B', x
cdef unsigned int R = thisrow[ b_length - 1 ]
return R
def editdistance_reference( text_a, text_b ):
"""This method is believed to compute a correct Damerau-Levenshtein edit distance, with deletions,
insertions, substitutions, and transpositions. Do not touch it; it is here to validate results returned
from the above method. Code adapted from
http://mwh.geek.nz/2009/04/26/python-damerau-levenshtein-distance"""
b_length = len( text_b )
oneago = None
thisrow = list( range( 1, b_length + 1 ) ) + [ 0 ]
for idx_a in range( len( text_a ) ):
twoago, oneago, thisrow = oneago, thisrow, [ 0 ] * b_length + [ idx_a + 1 ]
print
print '#EDR A', thisrow
for idx_b in range( b_length ):
deletion_cost = oneago[ idx_b ] + 1
addition_cost = thisrow[ idx_b - 1 ] + 1
substitution_cost = oneago[ idx_b - 1 ] + ( text_a[ idx_a ] != text_b[ idx_b ] )
thisrow[ idx_b ] = min( deletion_cost, addition_cost, substitution_cost )
if ( idx_a > 0
and idx_b > 0
and text_a[ idx_a ] == text_b[ idx_b - 1 ]
and text_a[ idx_a - 1 ] == text_b[ idx_b ]
and text_a[ idx_a ] != text_b[ idx_b ] ):
thisrow[ idx_b ] = min( thisrow[ idx_b ], twoago[ idx_b - 2 ] + 1 )
print '#EDR B', thisrow
return thisrow[ len( text_b ) - 1 ]
edit pastebin Cython.