2013-04-02 11:21:33 +02:00
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
|
|
'''
|
|
|
|
|
ponysay - Ponysay, cowsay reimplementation for ponies
|
2013-04-03 20:34:46 +02:00
|
|
|
|
|
2014-02-03 20:39:18 +01:00
|
|
|
|
Copyright (C) 2012, 2013, 2014 Erkin Batu Altunbaş et al.
|
2013-04-02 11:21:33 +02:00
|
|
|
|
|
2013-04-03 20:34:46 +02:00
|
|
|
|
|
|
|
|
|
This program is free software: you can redistribute it and/or modify
|
|
|
|
|
it under the terms of the GNU General Public License as published by
|
|
|
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
|
|
|
(at your option) any later version.
|
|
|
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
GNU General Public License for more details.
|
|
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
|
|
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
If you intend to redistribute ponysay or a fork of it commercially,
|
|
|
|
|
it contains aggregated images, some of which may not be commercially
|
|
|
|
|
redistributed; you would be required to remove those. To determine
|
|
|
|
|
whether or not you may commercially redistribute an image make sure
|
|
|
|
|
that line ‘FREE: yes’, is included inside the image between two ‘$$$’
|
|
|
|
|
lines and the ‘FREE’ is in upper case and directly followed by
|
|
|
|
|
the colon.
|
2013-04-02 11:21:33 +02:00
|
|
|
|
'''
|
|
|
|
|
from common import *
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SpelloCorrecter(): # Naïvely and quickly ported and adapted from optimised Java, may not be the nicest, or even fast, Python code
    '''
    Class used for correcting spellos and typos

    The distance between two words is a weighted edit distance: inserting or
    removing a character costs 1, and replacing a character costs 1 unless
    the pair is listed in `self.weights`.

    The words are examined in the order they were stored (sorted), so the rows
    of the distance matrix that belong to a prefix shared with the previously
    examined word are reused rather than recalculated.

    Note that correctly spelled words are not recognised any faster than
    misspelled words are corrected.
    It is also limited to words of size 0 to 127 (inclusive)
    '''

    def __init__(self, directories, ending = None):
        '''
        Constructor

        @param  directories:list<str>  List of directories that contains the file names with the correct spelling
        @param  ending:str             The file name ending of the correctly spelled file names, this is removed for the name

        -- OR -- (emulated overloading [overloading is absent in Python])

        @param  directories:list<str>  The file names with the correct spelling, the list itself is left unmodified
        '''
        # self.weights[u][p] is the cost of having written `u` where `p` is
        # the proper character; every pair not listed here costs 1
        self.weights = {'k' : {'c' : 0.25,  'g' : 0.75,  'q' : 0.125},
                        'c' : {'k' : 0.25,  'g' : 0.75,  's' : 0.5,  'z' : 0.5,  'q' : 0.125},
                        's' : {'z' : 0.25,  'c' : 0.5},
                        'z' : {'s' : 0.25,  'c' : 0.5},
                        'g' : {'k' : 0.75,  'c' : 0.75,  'q' : 0.9},
                        'o' : {'u' : 0.5},
                        'u' : {'o' : 0.5,  'v' : 0.75,  'w' : 0.5},
                        'b' : {'v' : 0.75},
                        'v' : {'b' : 0.75,  'w' : 0.5,  'u' : 0.7},
                        'w' : {'v' : 0.5,  'u' : 0.5},
                        'q' : {'c' : 0.125,  'k' : 0.125,  'g' : 0.9}}

        self.corrections = None
        # The dictionary is filled from the back: the last element is an empty
        # sentinel word, the first stored word is at index -2, and
        # `dictionaryEnd` is the index of the most recently stored word.
        # `reusable[i]` is the length of the prefix `dictionary[i]` has in
        # common with `dictionary[i + 1]`.
        self.dictionary = [None] * 513
        self.reusable = [0] * 512
        self.dictionaryEnd = 512
        self.closestDistance = 0

        # M[y][x] holds the distance between the first `y` characters of the
        # proper word and the first `x` characters of the used word
        self.M = [[y] + [0] * 127 for y in range(128)]
        self.M[0] = list(range(128))

        previous = ''
        self.dictionary[-1] = previous

        if ending is not None:
            for directory in directories:
                for filename in sorted(os.listdir(directory)):
                    stem = len(filename) - len(ending)
                    if (not endswith(filename, ending)) or (stem > 127):
                        continue
                    # Slice by length, `filename[:-0]` would be empty for an empty ending
                    previous = self.__insert(filename[:stem], previous)
        else:
            # `sorted` rather than `.sort()`, the caller's list must not be reordered
            for proper in sorted(directories):
                if len(proper) <= 127:
                    previous = self.__insert(proper, previous)


    @staticmethod
    def __commonPrefix(a, b, limit):
        '''
        Count the leading characters two words have in common

        @param   a:str      The first word
        @param   b:str      The second word
        @param   limit:int  The number of characters to compare, may not exceed the length of either word
        @return  :int       The number of equal leading characters, at most `limit`
        '''
        for i in range(limit):
            if a[i] != b[i]:
                return i
        return limit


    def __insert(self, proper, previous):
        '''
        Store a correctly spelled word in front of the words already stored

        @param   proper:str    The word to store
        @param   previous:str  The word that was stored just before this one, empty if none
        @return  :str          `proper`, to be passed as `previous` for the next word
        '''
        if self.dictionaryEnd == 0:
            # Out of room, double the storage by growing it at the front
            grown = len(self.dictionary)
            self.reusable = [0] * grown + self.reusable
            self.dictionary = [None] * grown + self.dictionary
            self.dictionaryEnd = grown

        self.dictionaryEnd -= 1
        self.dictionary[self.dictionaryEnd] = proper
        limit = min(len(previous), len(proper))
        self.reusable[self.dictionaryEnd] = self.__commonPrefix(previous, proper, limit)
        return proper


    def __cost(self, u, p):
        '''
        Get the cost of having written one character in place of another

        @param   u:str       The character that was used
        @param   p:str       The proper character
        @return  :int|float  0 if the characters are equal, otherwise the listed weight, or 1 if the pair is not listed
        '''
        if u == p:
            return 0
        if u in self.weights:
            return self.weights[u].get(p, 1)
        return 1


    def correct(self, used):
        '''
        Finds the closests correct spelled word

        @param   used:str                                  The word to correct
        @return  (words, distance):(list<str>, int|float)  A list the closest spellings and the weighted distance,
                                                           the list is empty if there are no words to choose from,
                                                           a word longer than 127 characters is returned as is with distance 0
        '''
        if len(used) > 127:
            return ([used], 0)

        self.__correct(used)
        return (self.corrections, self.closestDistance)


    def __correct(self, used):
        '''
        Finds the closests correct spelled word, the result is
        stored in `self.corrections` and `self.closestDistance`

        @param  used:str  The word to correct, it must satisfy all restrictions
        '''
        self.closestDistance = 0x7FFFFFFF
        self.corrections = [] # never leave the result of an earlier call behind
        previous = self.dictionary[-1]
        prevLen = 0 # number of rows in `self.M` that are valid for `previous`
        usedLen = len(used)
        usedChars = set(used)

        d = len(self.dictionary) - 1
        while d > self.dictionaryEnd:
            d -= 1
            proper = self.dictionary[d]
            properLen = len(proper)
            # The distance is never less than the difference in length
            if abs(properLen - usedLen) > self.closestDistance:
                continue

            # Rows for the prefix shared with the previously examined word are still valid
            rows = min(prevLen, properLen)
            if previous == self.dictionary[d + 1]:
                prevCommon = min(self.reusable[d], rows)
            else:
                prevCommon = self.__commonPrefix(previous, proper, rows)

            # A row only depends on its character through the cost against
            # each used character, so rows stay valid past the shared prefix
            # for as long as both words give the same costs
            skip = rows
            for i in range(prevCommon, rows):
                a = previous[i]
                b = proper[i]
                if (a != b) and any(self.__cost(u, a) != self.__cost(u, b) for u in usedChars):
                    skip = i
                    break

            # Columns for the prefix shared by `used` and `proper` need no calculation
            common = self.__commonPrefix(used, proper, min(skip, usedLen, properLen))

            distance = self.__distance(proper, skip, properLen, used, common, usedLen)

            if self.closestDistance > distance:
                self.closestDistance = distance
                self.corrections = [proper]
            elif self.closestDistance == distance:
                self.corrections.append(proper)

            previous = proper
            if distance >= 0x7FFFFF00:
                # The calculation was abandoned, the low bits tell how many rows are valid
                prevLen = distance & 255
            else:
                prevLen = properLen


    def __distance(self, proper, y0, yn, used, x0, xn):
        '''
        Calculate the distance between a correct word and a incorrect word

        Rows 0 to `y0` of `self.M` must already be valid for these words,
        and the first `x0` characters of the words must be identical

        @param   proper:str  The correct word
        @param   y0:int      The offset for `proper`
        @param   yn:int      The length, before applying `y0`, of `proper`
        @param   used:str    The incorrect word
        @param   x0:int      The offset for `used`, may not exceed `y0`
        @param   xn:int      The length, before applying `x0`, of `used`
        @return  :int|float  The distance between the words, or `0x7FFFFF00 | y` if the calculation
                             was abandoned at row `y` because the words cannot be close enough
        '''
        my = self.M[y0]
        for y in range(y0, yn):
            p = proper[y]
            myy = self.M[y + 1]

            # `used[:x0]` is a prefix of `proper`, so reaching it from
            # `proper[:y + 1]` takes exactly one removal per extra character;
            # the columns to the left of `x0` are further away still
            best = myy[x0] = y + 1 - x0

            for x in range(x0, xn):
                value = min(my[x] + self.__cost(used[x], p), 1 + min(myy[x], my[x + 1]))
                myy[x + 1] = value
                if best > value:
                    best = value

            # Every way to the final cell passes this row and costs are never negative
            if best > self.closestDistance:
                return 0x7FFFFF00 | y
            my = myy
        return my[xn]
|
|
|
|
|
|