mirror of
https://github.com/erkin/ponysay.git
synced 2025-03-03 16:01:27 +01:00
spelling correction mechanism with weigthed character change
This commit is contained in:
parent
e6bdf2e5ba
commit
1096f2c86e
1 changed files with 162 additions and 0 deletions
162
ponysay.py
162
ponysay.py
|
@ -1710,6 +1710,168 @@ class UCS():
|
|||
|
||||
|
||||
|
||||
'''
|
||||
Class used for correcting spellos and typos,
|
||||
|
||||
Note that this implementation will not find that correctly spelled word are correct faster than it corrects words.
|
||||
It is also limited to words of size 0 to 127 (inclusive)
|
||||
'''
|
||||
class SpelloCorrecter: # Naïvely and quickly proted and adapted from optimised Java, may not be the nicest, or even fast, Python code
|
||||
def __init__(self, directories, ending):
|
||||
self.weights = {'k' : {'c' : 0.25, 'g' : 0.75, 'q' : 0.125},
|
||||
'c' : {'k' : 0.25, 'g' : 0.75, 's' : 0.5, 'z' : 0.5, 'q' : 0.125},
|
||||
's' : {'z' : 0.25, 'c' : 0.5},
|
||||
'z' : {'s' : 0.25, 'c' : 0.5},
|
||||
'g' : {'k' : 0.75, 'c' : 0.75, 'q' : 0.9},
|
||||
'o' : {'u' : 0.5},
|
||||
'u' : {'o' : 0.5, 'v' : 0.75, 'w' : 0.5},
|
||||
'b' : {'v' : 0.75},
|
||||
'v' : {'b' : 0.75, 'w' : 0.5, 'u' : 0.7},
|
||||
'w' : {'v' : 0.5, 'u' : 0.5},
|
||||
'q' : {'c' : 0.125, 'k' : 0.125, 'g' : 0.9}}
|
||||
|
||||
self.corrections = None
|
||||
self.dictionary = [None] * 513
|
||||
self.reusable = [0] * 512
|
||||
self.dictionaryEnd = 512
|
||||
self.closestDistance = 0
|
||||
|
||||
self.M = [None] * 128
|
||||
for y in range(0, 128):
|
||||
self.M[y] = [0] * 128
|
||||
self.M[y][0] = y
|
||||
m0 = self.M[0]
|
||||
x = 127
|
||||
while x > -1:
|
||||
m0[x] = x
|
||||
x -= 1
|
||||
|
||||
previous = ""
|
||||
self.dictionary[-1] = previous;
|
||||
|
||||
for directory in directories:
|
||||
for filename : os.listdir(directory):
|
||||
if (not endswith(filename, ending)) or (len(filename) - len(ending) > 127):
|
||||
continue
|
||||
proper = filename[:-len(ending)]
|
||||
|
||||
if dictionaryEnd == 0:
|
||||
dictionaryEnd = len(self.dictionary)
|
||||
self.reusable = [0] * dictionaryEnd + self.reusable
|
||||
self.dictionary = [None] * dictionaryEnd + self.dictionary
|
||||
|
||||
dictionaryEnd -= 1
|
||||
dictionary[dictionaryEnd] = proper
|
||||
prevCommon = min(len(previous), len(proper))
|
||||
for i in range(0, prevCommon):
|
||||
if previous[i] != proper[i]:
|
||||
prevCommon = i
|
||||
break
|
||||
previous = dictionary[dictionaryEnd]
|
||||
reusable[dictionaryEnd] = prevCommon
|
||||
|
||||
|
||||
'''
|
||||
Finds the closests correct spelled word.
|
||||
The input is just one word, and the output is tuple
|
||||
with a list of the closest spellings, and the weigthed distance.
|
||||
'''
|
||||
def correct(self, used):
|
||||
if len(used) < 127:
|
||||
return ([used], 0)
|
||||
|
||||
__correct(used)
|
||||
return (seld.corrections, self.closestDistance)
|
||||
|
||||
|
||||
def __correct(self, used):
|
||||
self.closestDistance = 0x7FFFFFFF
|
||||
previous = self.dictionary[-1]
|
||||
prevLen = 0
|
||||
usedLen = len(used)
|
||||
|
||||
proper = None
|
||||
prevCommon = 0
|
||||
|
||||
d = len(self.dictionary)
|
||||
while d > self.dictionaryEnd:
|
||||
d -= 1
|
||||
proper = self.dictionary[d]
|
||||
if abs(len(proper) - usedLen) <= self.closestDistance:
|
||||
if previous == self.dictionary[d + 1]:
|
||||
prevCommon = self.reusable[d];
|
||||
else:
|
||||
prevCommon = min(prevLen, len(proper))
|
||||
for i in range(0, prevCommon):
|
||||
if previous[i] != proper[i]:
|
||||
prevCommon = i
|
||||
break
|
||||
|
||||
skip = min(prevLen, len(proper))
|
||||
i = prevCommon
|
||||
while i < skip:
|
||||
for u in range(0, usedLen):
|
||||
if (used[u] == previous[i]) or (used[u] == proper[i]):
|
||||
skip = i
|
||||
break
|
||||
i += 1
|
||||
|
||||
common = min(skip, min(usedLen, len(proper)))
|
||||
for i in range(0, common):
|
||||
if used[i] != proper[i]:
|
||||
common = i
|
||||
break
|
||||
|
||||
distance = self.__distance(proper, skip, proper.length, used, common, usedLen)
|
||||
|
||||
if self.closestDistance > distance:
|
||||
self.closestDistance = distance
|
||||
corrections = [proper]
|
||||
elif self.closestDistance == distance:
|
||||
corrections.append(proper)
|
||||
|
||||
previous = proper;
|
||||
if distance >= 0x7FFFFF00:
|
||||
prevLen = distance & 255
|
||||
else:
|
||||
prevLen = len(proper)
|
||||
|
||||
|
||||
def __distance(self, proper, y0, yn, used, x0, xn):
|
||||
my = self.M[y0]
|
||||
for y in range(y0, yn):
|
||||
best = 0x7FFFFFFF
|
||||
p = proper[y]
|
||||
myy = self.M[y + 1] # only one array bound check, and at most one + ☺
|
||||
x = x0
|
||||
while x < xn:
|
||||
change = my[x]
|
||||
u = used[x]
|
||||
if p == u:
|
||||
# commence black magick … twilight would be so disappointed
|
||||
x += 1
|
||||
myy[x] = change
|
||||
best = min(best, change)
|
||||
remove = myy[x]
|
||||
add = my[x + 1]
|
||||
|
||||
cw = 1
|
||||
if my[x] in self.weights:
|
||||
if p in self.weights[u]:
|
||||
cw = self.weights[u][p]
|
||||
|
||||
myy[x] = min(cw + change, 1 + min(remove, add))
|
||||
if best > myy[x]:
|
||||
best = myy[x]
|
||||
|
||||
if best > self.closestDistance:
|
||||
return 0x7FFFFFFF | y
|
||||
my = myy
|
||||
return my[xn]
|
||||
|
||||
|
||||
|
||||
|
||||
'''
|
||||
The user's home directory
|
||||
'''
|
||||
|
|
Loading…
Add table
Reference in a new issue