ponysay/src/spellocorrecter.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

'''
ponysay - Ponysay, cowsay reimplementation for ponies
Copyright (C) 2012, 2013  Erkin Batu Altunbaş et al.

This program is free software. It comes without any warranty, to
the extent permitted by applicable law. You can redistribute it
and/or modify it under the terms of the Do What The Fuck You Want
To Public License, Version 2, as published by Sam Hocevar. See
http://sam.zoy.org/wtfpl/COPYING for more details.
'''
from common import *


'''
Class used for correcting spellos and typos,

Note that this implementation will not find that correctly spelled word are correct faster than it corrects words.
It is also limited to words of size 0 to 127 (inclusive)
'''
class SpelloCorrecter(): # Naïvely and quickly ported and adapted from optimised Java, may not be the nicest, or even fast, Python code
    '''
    Constructor

    @param  directories:list<str>  List of directories that contains the file names with the correct spelling
    @param  ending:str             The file name ending of the correctly spelled file names, this is removed for the name
    '''
    def __init__(self, directories, ending):
        self.weights = {'k' : {'c' : 0.25, 'g' : 0.75, 'q' : 0.125},
                        'c' : {'k' : 0.25, 'g' : 0.75, 's' : 0.5, 'z' : 0.5, 'q' : 0.125},
                        's' : {'z' : 0.25, 'c' : 0.5},
                        'z' : {'s' : 0.25, 'c' : 0.5},
                        'g' : {'k' : 0.75, 'c' : 0.75, 'q' : 0.9},
                        'o' : {'u' : 0.5},
                        'u' : {'o' : 0.5, 'v' : 0.75, 'w' : 0.5},
                        'b' : {'v' : 0.75},
                        'v' : {'b' : 0.75, 'w' : 0.5, 'u' : 0.7},
                        'w' : {'v' : 0.5, 'u' : 0.5},
                        'q' : {'c' : 0.125, 'k' : 0.125, 'g' : 0.9}}

        self.corrections = None
        self.dictionary = [None] * 513
        self.reusable = [0] * 512
        self.dictionaryEnd = 512
        self.closestDistance = 0

        self.M = [None] * 128
        for y in range(0, 128):
            self.M[y] = [0] * 128
            self.M[y][0] = y
        m0 = self.M[0]
        x = 127
        while x > -1:
            m0[x] = x
            x -= 1

        previous = ''
        self.dictionary[-1] = previous;

        for directory in directories:
            for filename in os.listdir(directory):
                if (not endswith(filename, ending)) or (len(filename) - len(ending) > 127):
                    continue
                proper = filename[:-len(ending)]

                if self.dictionaryEnd == 0:
                    self.dictionaryEnd = len(self.dictionary)
                    self.reusable = [0] * self.dictionaryEnd + self.reusable
                    self.dictionary = [None] * self.dictionaryEnd + self.dictionary

                self.dictionaryEnd -= 1
                self.dictionary[self.dictionaryEnd] = proper

                prevCommon = min(len(previous), len(proper))
                for i in range(0, prevCommon):
                    if previous[i] != proper[i]:
                        prevCommon = i
                        break
                previous = proper
                self.reusable[self.dictionaryEnd] = prevCommon
        #part = self.dictionary[self.dictionaryEnd : len(self.dictionary) - 1]
        #part.sort()
        #self.dictionary[self.dictionaryEnd : len(self.dictionary) - 1] = part
        #
        #index = len(self.dictionary) - 1
        #while index >= self.dictionaryEnd:
        #    proper = self.dictionary[index]
        #    prevCommon = min(len(previous), len(proper))
        #    for i in range(0, prevCommon):
        #        if previous[i] != proper[i]:
        #            prevCommon = i
        #            break
        #    previous = proper
        #    self.reusable[self.dictionaryEnd] = prevCommon
        #    index -= 1;


    '''
    Finds the closests correct spelled word

    @param   used:str                               The word to correct
    @return  (words, distance):(list<string>, int)  A list the closest spellings and the weighted distance
    '''
    def correct(self, used):
        if len(used) > 127:
            return ([used], 0)

        self.__correct(used)
        return (self.corrections, self.closestDistance)


    '''
    Finds the closests correct spelled word

    @param  used:str  The word to correct, it must satisfy all restrictions
    '''
    def __correct(self, used):
        self.closestDistance = 0x7FFFFFFF
        previous = self.dictionary[-1]
        prevLen = 0
        usedLen = len(used)

        proper = None
        prevCommon = 0

        d = len(self.dictionary) - 1
        while d > self.dictionaryEnd:
            d -= 1
            proper = self.dictionary[d]
            if abs(len(proper) - usedLen) <= self.closestDistance:
                if previous == self.dictionary[d + 1]:
                    prevCommon = self.reusable[d];
                else:
                    prevCommon = min(prevLen, len(proper))
                    for i in range(0, prevCommon):
                        if previous[i] != proper[i]:
                            prevCommon = i
                            break

                skip = min(prevLen, len(proper))
                i = prevCommon
                while i < skip:
                    for u in range(0, usedLen):
                        if (used[u] == previous[i]) or (used[u] == proper[i]):
                            skip = i
                            break
                    i += 1

                common = min(skip, min(usedLen, len(proper)))
                for i in range(0, common):
                    if used[i] != proper[i]:
                        common = i
                        break

                distance = self.__distance(proper, skip, len(proper), used, common, usedLen)

                if self.closestDistance > distance:
                    self.closestDistance = distance
                    self.corrections = [proper]
                elif self.closestDistance == distance:
                    self.corrections.append(proper)

                previous = proper;
                if distance >= 0x7FFFFF00:
                    prevLen = distance & 255
                else:
                    prevLen = len(proper)


    '''
    Calculate the distance between a correct word and a incorrect word

    @param   proper:str  The correct word
    @param   y0:int      The offset for `proper`
    @param   yn:int      The length, before applying `y0`, of `proper`
    @param   used:str    The incorrect word
    @param   x0:int      The offset for `used`
    @param   xn:int      The length, before applying `x0`, of `used`
    @return  :float      The distance between the words
    '''
    def __distance(self, proper, y0, yn, used, x0, xn):
        my = self.M[y0]
        for y in range(y0, yn):
            best = 0x7FFFFFFF
            p = proper[y]
            myy = self.M[y + 1] # only one array bound check, and at most one + ☺
            x = x0
            while x < xn:
                change = my[x]
                u = used[x]
                if p == u:
                    # commence black magick … twilight would be so disappointed
                    x += 1
                    myy[x] = change
                    best = min(best, change)
                remove = myy[x]
                add = my[x + 1]

                cw = 1
                if my[x] in self.weights:
                    if p in self.weights[u]:
                      cw = self.weights[u][p]
                x += 1

                myy[x] = min(cw + change, 1 + min(remove, add))
                if best > myy[x]:
                    best = myy[x]

            if best > self.closestDistance:
                return 0x7FFFFF00 | y
            my = myy
        return my[xn]