ponysay/src/spellocorrecter.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

'''
ponysay - Ponysay, cowsay reimplementation for ponies
Copyright (C) 2012, 2013  Erkin Batu Altunbaş et al.

This program is free software. It comes without any warranty, to
the extent permitted by applicable law. You can redistribute it
and/or modify it under the terms of the Do What The Fuck You Want
To Public License, Version 2, as published by Sam Hocevar. See
http://sam.zoy.org/wtfpl/COPYING for more details.
'''
from common import *


'''
Class used for correcting spellos and typos.

Note that this implementation does not recognise a correctly spelled word any faster than it corrects a misspelled one.
It is also limited to words of length 0 to 127 (inclusive).
'''
class SpelloCorrecter(): # Naïvely and quickly ported and adapted from optimised Java, may not be the nicest, or even fast, Python code
    '''
    Suggests the closest known names for a (possibly misspelled) word.
    
    The known names are the file names, with `ending` stripped, found in the
    directories given to the constructor. Closeness is a weighted edit
    distance: inserting or removing a character costs 1, and replacing one
    costs 1 unless the pair is listed in `self.weights`.
    
    Words longer than 127 characters are neither stored nor corrected.
    '''
    
    def __init__(self, directories, ending):
        '''
        Constructor
        
        @param  directories:list<str>  List of directories that contains the file names with the correct spelling
        @param  ending:str             The file name ending of the correctly spelled file names, this is removed for the name
        '''
        # Substitution costs, looked up as weights[typed][correct]; unlisted pairs cost 1
        self.weights = {'k' : {'c' : 0.25, 'g' : 0.75, 'q' : 0.125},
                        'c' : {'k' : 0.25, 'g' : 0.75, 's' : 0.5, 'z' : 0.5, 'q' : 0.125},
                        's' : {'z' : 0.25, 'c' : 0.5},
                        'z' : {'s' : 0.25, 'c' : 0.5},
                        'g' : {'k' : 0.75, 'c' : 0.75, 'q' : 0.9},
                        'o' : {'u' : 0.5},
                        'u' : {'o' : 0.5, 'v' : 0.75, 'w' : 0.5},
                        'b' : {'v' : 0.75},
                        'v' : {'b' : 0.75, 'w' : 0.5, 'u' : 0.7},
                        'w' : {'v' : 0.5, 'u' : 0.5},
                        'q' : {'c' : 0.125, 'k' : 0.125, 'g' : 0.9}}
        
        self.corrections = None
        # The dictionary is filled from the back towards index 0; the last slot is a '' sentinel
        self.dictionary = [None] * 513
        # reusable[i]: length of the prefix dictionary[i] shares with the word stored just before it
        self.reusable = [0] * 512
        # Index of the most recently stored word (everything below it is unused)
        self.dictionaryEnd = 512
        self.closestDistance = 0
        
        # Distance matrix shared by all calls: one row per prefix length of the
        # correct word (0..127), one column per prefix length of the typed word.
        # `__distance` touches one column past the typed word's length when the
        # last character matches, so 129 columns are needed for 127-character words.
        self.M = [[0] * 129 for _ in range(128)]
        for y in range(128):
            self.M[y][0] = y
        self.M[0] = list(range(129))
        
        previous = ''
        self.dictionary[-1] = previous
        
        for directory in directories:
            for filename in os.listdir(directory):
                if (not endswith(filename, ending)) or (len(filename) - len(ending) > 127):
                    continue
                proper = filename[:-len(ending)]
                
                # Out of free slots: grow both lists at the front, doubling the capacity
                if self.dictionaryEnd == 0:
                    self.dictionaryEnd = len(self.dictionary)
                    self.reusable = [0] * self.dictionaryEnd + self.reusable
                    self.dictionary = [None] * self.dictionaryEnd + self.dictionary
                
                self.dictionaryEnd -= 1
                self.dictionary[self.dictionaryEnd] = proper
                
                limit = min(len(previous), len(proper))
                self.reusable[self.dictionaryEnd] = self.__commonPrefix(previous, proper, limit)
                previous = proper
        # NOTE: the words are kept in directory-listing order, they are not sorted,
        # so `reusable` only pays off when neighbouring entries happen to share a prefix.
    
    
    @staticmethod
    def __commonPrefix(a, b, limit):
        '''
        Count how many leading characters two words have in common
        
        @param   a:str      The first word
        @param   b:str      The second word
        @param   limit:int  The number of characters to compare, must not exceed the length of either word
        @return  :int       The index of the first difference, or `limit` if there is none
        '''
        for i in range(limit):
            if a[i] != b[i]:
                return i
        return limit
    
    
    def correct(self, used):
        '''
        Finds the closest correctly spelled words
        
        @param   used:str                                  The word to correct
        @return  (words, distance):(list<str>, int|float)  A list of the closest spellings and the weighted distance;
                                                           `([used], 0)` if the word is longer than 127 characters, and
                                                           `([], 0x7FFFFFFF)` if no word is known
        '''
        if len(used) > 127:
            return ([used], 0)
        
        self.__correct(used)
        return (self.corrections, self.closestDistance)
    
    
    def __correct(self, used):
        '''
        Finds the closest correctly spelled words and stores them in
        `self.corrections`, with their distance in `self.closestDistance`
        
        @param  used:str  The word to correct, it must satisfy all restrictions
        '''
        self.closestDistance = 0x7FFFFFFF
        # Start from an empty list so an empty dictionary yields no suggestions
        # rather than None or the result of an earlier call
        self.corrections = []
        previous = self.dictionary[-1]
        prevLen = 0
        usedLen = len(used)
        
        # Characters that can make a matrix row depend on the correct word's
        # character: those typed, and those with a reduced substitution cost
        # against something typed
        relevant = set(used)
        for u in used:
            relevant.update(self.weights.get(u, ()))
        
        d = len(self.dictionary) - 1
        while d > self.dictionaryEnd:
            d -= 1
            proper = self.dictionary[d]
            properLen = len(proper)
            # The length difference is a lower bound of the distance
            if abs(properLen - usedLen) > self.closestDistance:
                continue
            
            if previous == self.dictionary[d + 1]:
                prevCommon = self.reusable[d]
            else:
                prevCommon = self.__commonPrefix(previous, proper, min(prevLen, properLen))
            
            # Rows computed for `previous` are reused for `proper` up to the first
            # position, at or after the shared prefix, where either word has a
            # relevant character
            skip = min(prevLen, properLen)
            for i in range(prevCommon, skip):
                if (previous[i] in relevant) or (proper[i] in relevant):
                    skip = i
                    break
            
            common = self.__commonPrefix(used, proper, min(skip, usedLen, properLen))
            
            distance = self.__distance(proper, skip, properLen, used, common, usedLen)
            
            if self.closestDistance > distance:
                self.closestDistance = distance
                self.corrections = [proper]
            elif self.closestDistance == distance:
                self.corrections.append(proper)
            
            previous = proper
            if distance >= 0x7FFFFF00:
                # The calculation was abandoned, the low byte is the last row that is still valid
                prevLen = distance & 255
            else:
                prevLen = properLen
    
    
    def __distance(self, proper, y0, yn, used, x0, xn):
        '''
        Calculate the distance between a correct word and an incorrect word
        
        @param   proper:str  The correct word
        @param   y0:int      The offset for `proper`
        @param   yn:int      The length, before applying `y0`, of `proper`
        @param   used:str    The incorrect word
        @param   x0:int      The offset for `used`
        @param   xn:int      The length, before applying `x0`, of `used`
        @return  :int|float  The distance between the words, or `0x7FFFFF00 | y` if the calculation was
                             abandoned at row `y` because every value in it exceeded `self.closestDistance`
        '''
        weights = self.weights
        my = self.M[y0]
        for y in range(y0, yn):
            best = 0x7FFFFFFF
            p = proper[y]
            myy = self.M[y + 1] # only one array bound check, and at most one + ☺
            x = x0
            while x < xn:
                change = my[x]
                u = used[x]
                if p == u:
                    # commence black magick … twilight would be so disappointed
                    # NOTE(review): after a match the next column is filled in the same
                    # iteration without its own substitution cost; inherited from the
                    # Java original, presumably a deliberate shortcut, confirm before changing
                    x += 1
                    myy[x] = change
                    best = min(best, change)
                remove = myy[x]
                add = my[x + 1]
                
                # Cost of typing `u` where `p` was meant
                row = weights.get(u)
                cw = row.get(p, 1) if row else 1
                x += 1
                
                myy[x] = min(cw + change, 1 + min(remove, add))
                if best > myy[x]:
                    best = myy[x]
            
            if best > self.closestDistance:
                return 0x7FFFFF00 | y
            my = myy
        return my[xn]
splitting into multiple files Signed-off-by: Mattias Andrée <maandree@operamail.com> 2013-04-02 11:21:33 +02:00			`#!/usr/bin/env python3`
			`# -- coding: utf-8 --`

			`'''`
			`ponysay - Ponysay, cowsay reimplementation for ponies`
			`Copyright (C) 2012, 2013 Erkin Batu Altunbaş et al.`

			`This program is free software. It comes without any warranty, to`
			`the extent permitted by applicable law. You can redistribute it`
			`and/or modify it under the terms of the Do What The Fuck You Want`
			`To Public License, Version 2, as published by Sam Hocevar. See`
			`http://sam.zoy.org/wtfpl/COPYING for more details.`
			`'''`
			`from common import *`



			`'''`
			`Class used for correcting spellos and typos,`

			`Note that this implementation will not find that correctly spelled word are correct faster than it corrects words.`
			`It is also limited to words of size 0 to 127 (inclusive)`
			`'''`
			`class SpelloCorrecter(): # Naïvely and quickly ported and adapted from optimised Java, may not be the nicest, or even fast, Python code`
			`'''`
			`Constructor`

			`@param directories:list<str> List of directories that contains the file names with the correct spelling`
			`@param ending:str The file name ending of the correctly spelled file names, this is removed for the name`
			`'''`
			`def __init__(self, directories, ending):`
			`self.weights = {'k' : {'c' : 0.25, 'g' : 0.75, 'q' : 0.125},`
			`'c' : {'k' : 0.25, 'g' : 0.75, 's' : 0.5, 'z' : 0.5, 'q' : 0.125},`
			`'s' : {'z' : 0.25, 'c' : 0.5},`
			`'z' : {'s' : 0.25, 'c' : 0.5},`
			`'g' : {'k' : 0.75, 'c' : 0.75, 'q' : 0.9},`
			`'o' : {'u' : 0.5},`
			`'u' : {'o' : 0.5, 'v' : 0.75, 'w' : 0.5},`
			`'b' : {'v' : 0.75},`
			`'v' : {'b' : 0.75, 'w' : 0.5, 'u' : 0.7},`
			`'w' : {'v' : 0.5, 'u' : 0.5},`
			`'q' : {'c' : 0.125, 'k' : 0.125, 'g' : 0.9}}`

			`self.corrections = None`
			`self.dictionary = [None] * 513`
			`self.reusable = [0] * 512`
			`self.dictionaryEnd = 512`
			`self.closestDistance = 0`

			`self.M = [None] * 128`
			`for y in range(0, 128):`
			`self.M[y] = [0] * 128`
			`self.M[y][0] = y`
			`m0 = self.M[0]`
			`x = 127`
			`while x > -1:`
			`m0[x] = x`
			`x -= 1`

			`previous = ''`
			`self.dictionary[-1] = previous;`

			`for directory in directories:`
			`for filename in os.listdir(directory):`
			`if (not endswith(filename, ending)) or (len(filename) - len(ending) > 127):`
			`continue`
			`proper = filename[:-len(ending)]`

			`if self.dictionaryEnd == 0:`
			`self.dictionaryEnd = len(self.dictionary)`
			`self.reusable = [0] * self.dictionaryEnd + self.reusable`
			`self.dictionary = [None] * self.dictionaryEnd + self.dictionary`

			`self.dictionaryEnd -= 1`
			`self.dictionary[self.dictionaryEnd] = proper`

			`prevCommon = min(len(previous), len(proper))`
			`for i in range(0, prevCommon):`
			`if previous[i] != proper[i]:`
			`prevCommon = i`
			`break`
			`previous = proper`
			`self.reusable[self.dictionaryEnd] = prevCommon`
			`#part = self.dictionary[self.dictionaryEnd : len(self.dictionary) - 1]`
			`#part.sort()`
			`#self.dictionary[self.dictionaryEnd : len(self.dictionary) - 1] = part`
			`#`
			`#index = len(self.dictionary) - 1`
			`#while index >= self.dictionaryEnd:`
			`# proper = self.dictionary[index]`
			`# prevCommon = min(len(previous), len(proper))`
			`# for i in range(0, prevCommon):`
			`# if previous[i] != proper[i]:`
			`# prevCommon = i`
			`# break`
			`# previous = proper`
			`# self.reusable[self.dictionaryEnd] = prevCommon`
			`# index -= 1;`


			`'''`
			`Finds the closests correct spelled word`

			`@param used:str The word to correct`
			`@return (words, distance):(list<string>, int) A list the closest spellings and the weighted distance`
			`'''`
			`def correct(self, used):`
			`if len(used) > 127:`
			`return ([used], 0)`

			`self.__correct(used)`
			`return (self.corrections, self.closestDistance)`


			`'''`
			`Finds the closests correct spelled word`

			`@param used:str The word to correct, it must satisfy all restrictions`
			`'''`
			`def __correct(self, used):`
			`self.closestDistance = 0x7FFFFFFF`
			`previous = self.dictionary[-1]`
			`prevLen = 0`
			`usedLen = len(used)`

			`proper = None`
			`prevCommon = 0`

			`d = len(self.dictionary) - 1`
			`while d > self.dictionaryEnd:`
			`d -= 1`
			`proper = self.dictionary[d]`
			`if abs(len(proper) - usedLen) <= self.closestDistance:`
			`if previous == self.dictionary[d + 1]:`
			`prevCommon = self.reusable[d];`
			`else:`
			`prevCommon = min(prevLen, len(proper))`
			`for i in range(0, prevCommon):`
			`if previous[i] != proper[i]:`
			`prevCommon = i`
			`break`

			`skip = min(prevLen, len(proper))`
			`i = prevCommon`
			`while i < skip:`
			`for u in range(0, usedLen):`
			`if (used[u] == previous[i]) or (used[u] == proper[i]):`
			`skip = i`
			`break`
			`i += 1`

			`common = min(skip, min(usedLen, len(proper)))`
			`for i in range(0, common):`
			`if used[i] != proper[i]:`
			`common = i`
			`break`

			`distance = self.__distance(proper, skip, len(proper), used, common, usedLen)`

			`if self.closestDistance > distance:`
			`self.closestDistance = distance`
			`self.corrections = [proper]`
			`elif self.closestDistance == distance:`
			`self.corrections.append(proper)`

			`previous = proper;`
			`if distance >= 0x7FFFFF00:`
			`prevLen = distance & 255`
			`else:`
			`prevLen = len(proper)`


			`'''`
			`Calculate the distance between a correct word and a incorrect word`

			`@param proper:str The correct word`
			@param y0:int The offset for `proper`
			@param yn:int The length, before applying `y0`, of `proper`
			`@param used:str The incorrect word`
			@param x0:int The offset for `used`
			@param xn:int The length, before applying `x0`, of `used`
			`@return :float The distance between the words`
			`'''`
			`def __distance(self, proper, y0, yn, used, x0, xn):`
			`my = self.M[y0]`
			`for y in range(y0, yn):`
			`best = 0x7FFFFFFF`
			`p = proper[y]`
			`myy = self.M[y + 1] # only one array bound check, and at most one + ☺`
			`x = x0`
			`while x < xn:`
			`change = my[x]`
			`u = used[x]`
			`if p == u:`
			`# commence black magick … twilight would be so disappointed`
			`x += 1`
			`myy[x] = change`
			`best = min(best, change)`
			`remove = myy[x]`
			`add = my[x + 1]`

			`cw = 1`
			`if my[x] in self.weights:`
			`if p in self.weights[u]:`
			`cw = self.weights[u][p]`
			`x += 1`

			`myy[x] = min(cw + change, 1 + min(remove, add))`
			`if best > myy[x]:`
			`best = myy[x]`

			`if best > self.closestDistance:`
			`return 0x7FFFFF00 \| y`
			`my = myy`
			`return my[xn]`