2013-04-02 11:21:33 +02:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
'''
|
|
|
|
ponysay - Ponysay, cowsay reimplementation for ponies
|
|
|
|
Copyright (C) 2012, 2013 Erkin Batu Altunbaş et al.
|
|
|
|
|
|
|
|
This program is free software. It comes without any warranty, to
|
|
|
|
the extent permitted by applicable law. You can redistribute it
|
|
|
|
and/or modify it under the terms of the Do What The Fuck You Want
|
|
|
|
To Public License, Version 2, as published by Sam Hocevar. See
|
|
|
|
http://sam.zoy.org/wtfpl/COPYING for more details.
|
|
|
|
'''
|
|
|
|
from common import *
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
'''
|
|
|
|
Class used for correcting spellos and typos.
|
|
|
|
|
|
|
|
Note that this implementation will not confirm that a correctly spelled word is correct any faster than it corrects a misspelled word.
|
|
|
|
It is also limited to words of size 0 to 127 (inclusive)
|
|
|
|
'''
|
|
|
|
class SpelloCorrecter(): # Naïvely and quickly ported and adapted from optimised Java, may not be the nicest, or even fast, Python code
    '''
    Constructor
    
    @param  directories:list<str>  List of directories that contains the file names with the correct spelling
    @param  ending:str             The file name ending of the correctly spelled file names, this is removed for the name
    
    -- OR -- (emulated overloading [overloading is absent in Python])
    
    @param  directories:list<str>  The file names with the correct spelling
    '''
    def __init__(self, directories, ending = None):
        # Substitution costs, looked up as weights[typed character][correct character];
        # any pair that is not listed costs 1, just like an insertion or a removal
        self.weights = {'k' : {'c' : 0.25,  'g' : 0.75,  'q' : 0.125},
                        'c' : {'k' : 0.25,  'g' : 0.75,  's' : 0.5,  'z' : 0.5,  'q' : 0.125},
                        's' : {'z' : 0.25,  'c' : 0.5},
                        'z' : {'s' : 0.25,  'c' : 0.5},
                        'g' : {'k' : 0.75,  'c' : 0.75,  'q' : 0.9},
                        'o' : {'u' : 0.5},
                        'u' : {'o' : 0.5,  'v' : 0.75,  'w' : 0.5},
                        'b' : {'v' : 0.75},
                        'v' : {'b' : 0.75,  'w' : 0.5,  'u' : 0.7},
                        'w' : {'v' : 0.5,  'u' : 0.5},
                        'q' : {'c' : 0.125,  'k' : 0.125,  'g' : 0.9}}
        
        # Result of the latest query
        self.corrections = None
        self.closestDistance = 0
        
        # The dictionary is filled from the back towards the front: the last element is
        # an empty-string sentinel, the words are stored at the indices from
        # `dictionaryEnd` up to the sentinel, and `reusable[i]` is the length of the
        # prefix that `dictionary[i]` has in common with `dictionary[i + 1]`
        self.dictionary = [None] * 513
        self.reusable = [0] * 512
        self.dictionaryEnd = 512
        
        # Edit distance matrix: M[y][x] is the distance between the `y` first characters
        # of a correct word and the `x` first characters of the word being corrected.
        # Row 0 and column 0 are constant boundaries, everything else is scratch space.
        self.M = [[y] + [0] * 127 for y in range(0, 128)]
        self.M[0] = list(range(0, 128))
        
        previous = ''
        self.dictionary[-1] = previous
        
        if ending is not None:
            for directory in directories:
                for filename in os.listdir(directory):
                    if (not endswith(filename, ending)) or (len(filename) - len(ending) > 127):
                        continue
                    # Explicit end index, `filename[:-len(ending)]` would be empty for an empty ending
                    previous = self.__add(filename[:len(filename) - len(ending)], previous)
        else:
            for proper in directories:
                if len(proper) > 127:
                    continue
                previous = self.__add(proper, previous)
    
    
    '''
    Length of the common prefix of two strings
    
    @param   a:str      The first string
    @param   b:str      The second string
    @param   limit:int  The maximum number of characters to compare, may not exceed the length of either string
    @return  :int       The number of leading characters, at most `limit`, that are identical
    '''
    def __commonPrefix(self, a, b, limit):
        for i in range(0, limit):
            if a[i] != b[i]:
                return i
        return limit
    
    
    '''
    Store a correctly spelled word in the dictionary
    
    @param   proper:str    The word to store, at most 127 characters long
    @param   previous:str  The word that was stored just before this one, empty if none
    @return  :str          `proper`, to be passed as `previous` the next time
    '''
    def __add(self, proper, previous):
        if self.dictionaryEnd == 0:
            # Out of room: double the capacity by growing at the front, so that
            # the words that are already stored keep their position relative to the end
            self.dictionaryEnd = len(self.dictionary)
            self.reusable = [0] * self.dictionaryEnd + self.reusable
            self.dictionary = [None] * self.dictionaryEnd + self.dictionary
        
        self.dictionaryEnd -= 1
        self.dictionary[self.dictionaryEnd] = proper
        self.reusable[self.dictionaryEnd] = self.__commonPrefix(previous, proper, min(len(previous), len(proper)))
        return proper
    
    
    '''
    Finds the closest correctly spelled words
    
    @param   used:str                                  The word to correct
    @return  (words, distance):(list<str>, int|float)  A list of the closest spellings, in the order they were added,
                                                       and their weighted distance; a word longer than 127 characters
                                                       is returned as is with the distance 0, and an empty dictionary
                                                       gives an empty list with the distance 0x7FFFFFFF
    '''
    def correct(self, used):
        if len(used) > 127:
            return ([used], 0)
        
        self.__correct(used)
        return (self.corrections, self.closestDistance)
    
    
    '''
    Finds the closest correctly spelled words and stores the
    result in `self.corrections` and `self.closestDistance`
    
    @param  used:str  The word to correct, it must satisfy all restrictions
    '''
    def __correct(self, used):
        self.closestDistance = 0x7FFFFFFF
        self.corrections = []
        usedLen = len(used)
        
        # The rows M[0] to M[validRows] hold the distances for previous[:validRows]
        previous = self.dictionary[-1]
        validRows = 0
        
        d = len(self.dictionary) - 1
        while d > self.dictionaryEnd:
            d -= 1
            proper = self.dictionary[d]
            properLen = len(proper)
            
            # Every insertion and removal costs 1, so the distance is never less than the difference in length
            if abs(properLen - usedLen) > self.closestDistance:
                continue
            
            # Rows computed for a prefix shared with the previously measured word are still correct
            limit = min(validRows, properLen)
            if previous == self.dictionary[d + 1]:
                shared = min(self.reusable[d], limit)
            else:
                shared = self.__commonPrefix(previous, proper, limit)
            
            distance = self.__distance(proper, shared, properLen, used, 0, usedLen)
            
            if self.closestDistance > distance:
                self.closestDistance = distance
                self.corrections = [proper]
            elif self.closestDistance == distance:
                self.corrections.append(proper)
            
            previous = proper
            if distance >= 0x7FFFFF00:
                # Aborted while processing the character with the index in the low byte, its row was completed
                validRows = (distance & 255) + 1
            else:
                validRows = properLen
    
    
    '''
    Calculate the distance between a correct word and a incorrect word
    
    The rows M[y0 + 1] to M[yn] of the matrix are overwritten. M[y0] must already hold
    the distances for proper[:y0], and if `x0` is not zero the caller must also guarantee
    that column `x0` is correct in all of those rows.
    
    @param   proper:str  The correct word
    @param   y0:int      The offset for `proper`
    @param   yn:int      The length, before applying `y0`, of `proper`
    @param   used:str    The incorrect word
    @param   x0:int      The offset for `used`
    @param   xn:int      The length, before applying `x0`, of `used`
    @return  :int|float  The distance between the words, or `0x7FFFFF00 | y` if the calculation was aborted at
                         the character with index `y` in `proper` because every value on its row already
                         exceeded `self.closestDistance`
    '''
    def __distance(self, proper, y0, yn, used, x0, xn):
        weights = self.weights
        closest = self.closestDistance
        my = self.M[y0]
        for y in range(y0, yn):
            p = proper[y]
            myy = self.M[y + 1]
            # The boundary column takes part in the row minimum, an optimal path may pass through it
            best = myy[x0]
            for x in range(x0, xn):
                u = used[x]
                change = my[x]
                if p == u:
                    # Identical characters: keeping them is free and never worse than the alternatives
                    cell = change
                else:
                    cw = weights.get(u, {}).get(p, 1)
                    cell = min(cw + change, 1 + min(myy[x], my[x + 1]))
                myy[x + 1] = cell
                if best > cell:
                    best = cell
            
            # No cost is negative so the row minimum never decreases, this word cannot win or tie anymore
            if best > closest:
                return 0x7FFFFF00 | y
            my = myy
        return my[xn]
|
|
|
|
|