贝叶斯原理做最大似然估计实现拼写纠错

2022-07-31

import functools
import os
import re
import time
import traceback
from itertools import permutations
from os import getcwd
from sys import exc_info, stdout

import jieba
import numpy as np
import pandas as pd


def shorterror(func):
    """Decorator: run *func*, printing a short (2-frame) traceback on failure.

    On success the wrapped function's result is returned unchanged (the
    original had a stray trailing comma, so every result was silently
    wrapped in a 1-tuple). After an exception has been reported, None is
    returned — best-effort semantics are preserved.
    """
    @functools.wraps(func)
    def In(*vars):
        try:
            return func(*vars)
        except Exception:
            exc_type, exc_value, exc_traceback_obj = exc_info()
            # limit=2 keeps the printed traceback short by design.
            traceback.print_exception(exc_type, exc_value, exc_traceback_obj, limit=2, file=stdout)
            print("exc_type: %s" % exc_type)
            print("exc_value: %s" % exc_value)
            print("exc_traceback_obj: %s" % exc_traceback_obj)
    return In

def longerror(func):
    """Decorator: enable cgitb's verbose text traceback on failure, then re-raise.

    Fixes two defects in the original: (1) after the except handler it called
    ``func(*vars)`` a second time, re-running all side effects, and (2) a stray
    trailing comma wrapped every successful result in a 1-tuple.
    """
    @functools.wraps(func)
    def In(*vars):
        try:
            return func(*vars)
        except Exception:
            # cgitb installs a verbose text excepthook; re-raising lets it
            # (or any outer handler) report the original failure.
            import cgitb
            cgitb.enable(format='text')
            raise
    return In

def calltime(func):
    """Decorator: print the CPU time a call takes, then return its result.

    The original called *func* TWICE — once for timing and once more to
    produce the return value — doubling every side effect. It now runs
    exactly once and the timed result is returned.
    """
    @functools.wraps(func)
    def In(*varc):
        start = time.process_time()
        result = func(*varc)
        print('The Function', func.__name__, 'Takes Time To Run :', time.process_time() - start, 'Seconds')
        return result
    return In



symbles=''':,"{[}](>)</\n。●  ,、的 啊 好 和
并 与 及 对 错 你 我 我们 她 他 它:: ; ;《 》
1 2 3 4 5 6 7 8 9 0  ‘ “ ” ’ + - * / ` ~ 
\( \ [ \ { \ } ] ) ( )【 \xa0 】理想 愿景
工 不管 只要 一员 大家庭 当成 作 帅哥 美女 年轻
佛系
'''
# Delete stop characters/words
def del_stop_word(strings, symbles=symbles):
    """Remove every character that appears in *symbles* from *strings*."""
    # Every character maps to the empty string, so one translate() pass
    # strips them all — equivalent to substituting each escaped character
    # with "" via a compiled regex alternation.
    table = str.maketrans({ch: '' for ch in symbles})
    return strings.translate(table)
    
#读取文档
def read_txt(path):
    """Return the full contents of the text file at *path*.

    Uses a context manager so the file handle is always closed (the
    original leaked the handle returned by open()).
    NOTE(review): encoding is left as the platform default to preserve
    behavior — consider passing encoding="utf-8" explicitly for the
    Chinese corpus files this script reads.
    """
    with open(path, 'r') as f:
        return f.read()
 
#只要中文
def just_chinese(string, resymbol=""):
    """Replace every non-CJK character in *string* with *resymbol*
    (default: delete it), keeping only characters in U+4E00..U+9FA5."""
    non_chinese = re.compile(u"([^\u4e00-\u9fa5])")
    return non_chinese.sub(resymbol, string)

#分词
def split_world(corpus):
    """Tokenize the Chinese-only text of the file at *corpus* with jieba,
    returning the tokens as a numpy array."""
    text = just_chinese(read_txt(corpus))
    tokens = jieba.cut(text)
    return np.array(list(tokens))

#整理成词典
def word_dict_func(corpus, log=False):
    """Build a {token: probability} dictionary from the corpus file.

    With log=True the values are negative log-probabilities instead of
    raw relative frequencies.
    """
    tokens = split_world(corpus)
    # count_nonzero on a string array counts non-empty tokens.
    total = np.count_nonzero(tokens)
    kinds, counts = np.unique(tokens, return_counts=True)
    freqs = counts / total
    if log:
        freqs = -np.log(freqs)
    return dict(zip(kinds, freqs))


# 加载本地词典
#只要数字
def just_number(string, resymbol=""):
    """Replace every character that is not an ASCII digit (U+0030..U+0039)
    with *resymbol* (default: delete it)."""
    non_digit = re.compile(u"([^\u0030-\u0039])")
    return non_digit.sub(resymbol, string)

#sigmod预留函数,转概率空前备用
def sigmod(z):
    """Logistic sigmoid, reserved for squashing scores into (0, 1)."""
    exp_neg = np.exp(-z)
    return 1 / (1 + exp_neg)

#过滤字词语频率生成字典
def filter_dict(words, numbers):
    """Turn parallel (word, count-string) sequences into {word: probability}.

    Each count string is cleaned with just_number() and parsed as a float;
    entries whose count cannot be parsed contribute a count of 0 instead of
    being dropped. Fixes three defects in the original: `expr` was unbound
    (NameError) when the very first row failed to parse, a failed row
    re-inserted the PREVIOUS row's entry, and an all-failed input divided
    by zero.
    """
    word_dict, total = dict(), 0.0
    for word, num in zip(words, numbers):
        try:
            count = float(just_number(num))
        except (TypeError, ValueError):
            # Unparseable count: keep the word but weight it zero.
            count = 0.0
        total += count
        word_dict[word] = count
    if total == 0:
        # No usable counts at all — avoid ZeroDivisionError.
        return {word: 0.0 for word in word_dict}
    return {word: count / total for word, count in word_dict.items()}

#加载本地字典入口函数
def location_dict(dir_path):
    """Load every word-frequency file under *dir_path* into one probability dict.

    Each file is read as tab-separated text; column 0 is taken as the word and
    column 1 as a raw count string (schema assumed from the slicing below —
    TODO confirm against the actual dictionary files). Files that np.loadtxt
    rejects are re-parsed line by line as a fallback.
    """
    # 2x2 zero block is only a seed so np.r_ concatenation has something to
    # stack onto; its two rows are sliced off again before returning.
    init_dict = np.zeros((2,2))
    for path in os.listdir(dir_path):
        try:
            # NOTE(review): assumes dir_path ends with a path separator.
            file_path = "{}{}".format(dir_path,path)
            file_of_one = np.loadtxt(file_path,delimiter='\t',dtype=str)
            print("ok:",path)
        except Exception as error:
            # Fallback for files loadtxt cannot parse (e.g. ragged rows).
            # NOTE(review): the file handle opened here is never closed.
            file_of_one = np.array([line.replace("\n","").split("\t") for line in open(file_path,'r').readlines()])
            print("error:",path,error)
        finally :
            init_dict = np.r_[init_dict,file_of_one]
    # Drop the 2 seed rows; column 0 = words, column 1 = raw count strings.
    words,numbers = init_dict[2:,0],init_dict[2:,1]
    return filter_dict(words,numbers)



#生成单词补充模块
#创建补充单词字典
def create_char_map(str_range = 'lowercase',chinese_path = False):
    """Return an iterable alphabet for generating candidate characters.

    str_range: 'lowercase' -> a-z, 'uppercase' -> A-Z, 'numbers' -> 0-9,
    'chinese' -> the set of distinct Chinese characters in a corpus file.
    chinese_path: optional corpus path for the 'chinese' alphabet; falls
    back to the module-level ``corpus`` (the original accepted this
    parameter but never used it).

    Fixes in this version: the original ranges were off by one (they
    produced a-y, A-Z minus 'Z', and 0-8), and the 'chinese' corpus was
    read eagerly on EVERY call even when an ASCII alphabet was requested.
    Raises KeyError for an unknown *str_range*, as before.
    """
    if str_range == 'lowercase':
        return map(chr, range(97, 123))   # 'a'..'z' inclusive
    if str_range == 'uppercase':
        return map(chr, range(65, 91))    # 'A'..'Z' inclusive
    if str_range == 'numbers':
        return map(chr, range(48, 58))    # '0'..'9' inclusive
    if str_range == 'chinese':
        path = chinese_path if chinese_path else corpus
        return tuple(set(just_chinese(read_txt(path))))
    raise KeyError(str_range)

#展开拼接字符组合
def collate_char_iterator(itertools_perm):
    """Join each character tuple produced by *itertools_perm* into a string,
    lazily."""
    return ("".join(chars) for chars in itertools_perm)

#字符生成器
def chargen(language="lowercase",n=1):
    """Generate every length-*n* string built from permutations of the
    chosen alphabet."""
    alphabet = create_char_map(language)
    perms = permutations(alphabet, n)
    return collate_char_iterator(perms)

#编辑距离添加
def add_char(input_char,language="lowercase",n=2,forward=True):
    """Yield 'add' edit variants: each generated n-char chunk is prepended
    (forward=True) or appended (forward=False) to *input_char*."""
    for chunk in chargen(language=language, n=n):
        if forward == True:
            yield "{}{}".format(chunk, input_char)
        else:
            yield "{}{}".format(input_char, chunk)
#编辑距离替换
def replace_char(input_char,language="lowercase",n=2):
    """Generator of 'replace' edit variants of *input_char*.

    For every generated n-char string and every position i, yields
    input_char with the substring input_char[i:i+n] replaced by that string.
    NOTE(review): str.replace substitutes ALL occurrences of the substring,
    not just the one at position i — confirm this is intended.
    """
    m = len(input_char)
    S = chargen(language=language,n=n)
    for create_str in S:
        for i in range(m):
            # `result` would only receive a value via generator.send();
            # plain iteration makes it None, so the assignment is unused.
            result = yield input_char.replace(input_char[i:i+n],create_str)

#批量字符串删除函数
def delete_element(strings,symbles=symbles):
    """Strip every character of *symbles* out of *strings*."""
    # All targets map to the empty string, so a plain alternation of the
    # escaped characters substituted with "" does the whole job.
    pattern = re.compile("|".join(re.escape(ch) for ch in symbles))
    return pattern.sub("", strings)

#编辑距离删除
def delete_char(input_char,language="lowercase",n=2):
    """Yield 'delete' edit variants: *input_char* with each n-permutation
    of its own characters removed."""
    for chars in permutations(input_char, n):
        yield delete_element(input_char, "".join(chars))

#编辑距离生成
def translation_str(input_char,language="lowercase",n=2):
    """Collect every single-operation edit variant (delete, replace,
    prepend, append) of *input_char* into one tuple."""
    variants = []
    variants.extend(delete_char(input_char, language=language, n=n))
    variants.extend(replace_char(input_char, language=language, n=n))
    variants.extend(add_char(input_char, language=language, n=n, forward=True))
    variants.extend(add_char(input_char, language=language, n=n, forward=False))
    return tuple(variants)

#批量编辑距离生成
def translation_n(input_char,language="lowercase",n=2):
    """Gather edit variants for every edit size from 1 to *n*, returned
    as a numpy array."""
    collected = []
    for size in range(1, n + 1):
        collected.extend(translation_str(input_char, language=language, n=size))
    return np.array(collected)


#拼写纠错模块
@calltime
def check_str(input_char,word_dict=False,error_dict=False):
    """Bayesian spelling correction for *input_char*.

    Scores each edit-distance-1 candidate c by P(c|error) * P(input_char),
    maximum-likelihood style. Returns {"EM": best, "D": candidates,
    "C": P(input), "bayes": last score} when a correction is found,
    otherwise returns *input_char* unchanged.

    Fixes over the original: a falsy error_dict no longer raises
    TypeError (`x in False`), and an input word with no matching
    candidates no longer crashes on `max()` of an empty dict / an
    unbound `bayes_`.
    """
    if not word_dict:
        # No language model supplied: build one from the module corpus.
        word_dict = word_dict_func(corpus, log=False)
    if not error_dict:
        error_dict = {}
    if input_char not in word_dict:
        return input_char
    candidates = filter(lambda word: len(word) > 0,
                        translation_n(input_char, language="chinese", n=1))
    Pc = word_dict[input_char]
    prob_dict = dict()
    bayes_ = 0.0
    for sc_element in candidates:
        if sc_element in error_dict:
            bayes_ = error_dict[sc_element] * Pc
            # NOTE: keyed by probability, so equal-score candidates collide
            # (last one wins) — preserved from the original design.
            prob_dict[bayes_] = sc_element
    if not prob_dict:
        # No candidate found in the error model.
        return input_char
    Eword = prob_dict[max(prob_dict)]
    return {"EM": Eword, "D": prob_dict, "C": Pc, "bayes": bayes_}
    

# Static configuration (hard-coded, user-specific absolute paths —
# NOTE(review): consider turning these into CLI arguments).
corpus = "/Users/manmanzhang/Library/Mobile Documents/com~apple~CloudDocs/MyProject/InferenceSystem/src/I5_algorithm/NLP数据集合/豆瓣电影数据集(2019.3)/豆瓣电影简介.txt"
dir_path = "/Users/manmanzhang/Library/Mobile Documents/com~apple~CloudDocs/MyProject/InferenceSystem/src/I5_algorithm/NLP数据集合/词库/chinese/"
# Error-model probabilities built from the local word-frequency files.
example_error = location_dict(dir_path)
example_error  # bare expression — only useful in a REPL/notebook
# Language-model frequencies from the corpus (plain, not -log).
word_dict = word_dict_func(corpus,log=False)

# Test run: read one word from stdin and try to correct it.
test = check_str(input(),word_dict,example_error)
# NOTE(review): assumes check_str returned a dict; it returns the raw
# input string when the word is unknown — confirm before indexing.
EMword , D , C , bayes = test["EM"],test['D'],test['C'],test['bayes']
EMword  # bare expression — only useful in a REPL/notebook


本文地址:https://blog.csdn.net/weixin_43069769/article/details/107655775

《贝叶斯原理做最大似然估计实现拼写纠错.doc》

下载本文的Word格式文档,以方便收藏与打印。