Python 最大概率法進行漢語切分的方法

2020-02-16 00:08:12

字體：大中小

來源：轉載

供稿：網友

要求：

1 采用基于語言模型的最大概率法進行漢語切分。

2 切分算法中的語言模型可以采用 n-gram 語言模型，要求 n > 1，并至少采用一種平滑方法。

代碼：

廢話不說，代碼是最好的語言

import math
import re

MAX_SPLITLEN = 4  # maximum candidate word length, in characters
MAX_SEGMENTATIONS = 1000  # cap on segmentations enumerated per sentence (speed over precision)
LONG_SENTENCE_LEN = 20  # character runs at least this long are halved before being segmented
DEFAULT_OUTPUT_PATH = 'D:/1佩王的文件/計算語言學基礎/生成結果.txt'
corpus_lib = ''  # raw text of the training corpus

# Memoised n-gram counts; valid only for the corpus object stored next to them.
_ngram_count_cache = {}
_ngram_cache_corpus = None


def init_corpus_lib(path):
    """Load the training corpus at *path* into the module-level ``corpus_lib``.

    The file is decoded as UTF-8 and undecodable bytes are ignored.
    """
    global corpus_lib
    with open(path, 'r', encoding='utf-8', errors='ignore') as file:
        # Keep the raw text. The previous str(file.readlines()) mixed list
        # punctuation and escaped "\\n" sequences into the corpus.
        corpus_lib = file.read()


def get_candidate_words(sen):
    """Return every candidate word of *sen* as ``[word, start, end]`` (inclusive indices).

    Every single character is a candidate, because it may be missing from the
    corpus. Longer spans of up to ``MAX_SPLITLEN`` characters are candidates
    only when they occur as a substring of the corpus text.
    """
    candidate_words = []
    length = len(sen)
    for sp in range(length):
        candidate_words.append([sen[sp], sp, sp])
        for end in range(sp + 1, min(sp + MAX_SPLITLEN, length)):
            w = sen[sp:end + 1]
            if w in corpus_lib:
                candidate_words.append([w, sp, end])
    print('候選詞有：%s' % candidate_words)
    return candidate_words


def segment_sentence(sen):
    """Enumerate segmentations of *sen* built from its candidate words.

    Returns a list of strings in which the words are separated by one space.
    The search is depth-first and tries the longest word first, so complete
    segmentations are produced immediately. At most ``MAX_SEGMENTATIONS`` are
    returned. An empty sentence yields an empty list.
    """
    candidate_words = get_candidate_words(sen)
    sen_len = len(sen)
    word_segment_res_list = []
    if sen_len:
        # words_from[i] holds the (end, word) pairs of candidates starting at i.
        words_from = [[] for _ in range(sen_len)]
        for word, start, end in candidate_words:
            words_from[start].append((end, word))
        for options in words_from:
            options.sort()  # ascending end, so the longest word is popped first

        stack = [(0, ())]  # (next position to cover, words chosen so far)
        while stack and len(word_segment_res_list) < MAX_SEGMENTATIONS:
            pos, words = stack.pop()
            if pos == sen_len:
                word_segment_res_list.append(' '.join(words))
                continue
            for end, word in words_from[pos]:
                stack.append((end + 1, words + (word,)))
    print('獲得的所有分詞結果是：')
    print(word_segment_res_list)
    return word_segment_res_list


def _count_word_run(words):
    """Count how often the tokens in *words* occur consecutively in the corpus.

    Tokens must be delimited by whitespace or by the start/end of the text.
    NOTE(review): this assumes the corpus is plain whitespace-separated words;
    if tokens carry suffixes such as part-of-speech tags, nothing will match.
    """
    global _ngram_cache_corpus
    if _ngram_cache_corpus is not corpus_lib:
        # The corpus was replaced since the cache was filled: start over.
        _ngram_count_cache.clear()
        _ngram_cache_corpus = corpus_lib
    key = tuple(words)
    count = _ngram_count_cache.get(key)
    if count is None:
        first, *rest = key
        tail = ''.join(r'\s+' + re.escape(w) for w in rest)
        # Only the first token is consumed; the rest sits in a lookahead so
        # that overlapping runs such as "A A A" are all counted.
        pattern = r'(?<!\S)' + re.escape(first) + '(?=' + tail + r'(?!\S))'
        count = len(re.findall(pattern, corpus_lib))
        _ngram_count_cache[key] = count
    return count


# P(w1,w2,...,wn) = P(w1|start) P(w2|w1) P(w3|w2) ... P(wn|wn-1)
def calculate_word_sequence_probability(sequence):
    """Return the log-score of a space-separated word *sequence*.

    The first word contributes ``log((count(w1) + 1) / total)`` and every
    adjacent pair contributes ``log((count(w_i w_i+1) + 1) / total)``; the
    ``+ 1`` is add-one smoothing.

    NOTE: ``total`` is ``len(corpus_lib)``, i.e. the corpus length in
    characters, exactly as in the original implementation.
    """
    word_list = sequence.split(' ')
    total_word_num = len(corpus_lib)
    prob_total = math.log((_count_word_run(word_list[:1]) + 1) / total_word_num)
    for prev_w, later_w in zip(word_list, word_list[1:]):
        pair_count = _count_word_run((prev_w, later_w)) + 1
        prob_total += math.log(pair_count / total_word_num)
    print('%s的概率是：' % sequence)
    print(prob_total)
    return prob_total


def calculate_biggest_prob(word_segm_res):
    """Return the most probable segmentation from *word_segm_res*.

    Segmentations whose word count is at most 0.6 times the character count
    are preferred. Only when none qualifies are all candidates scored. Ties
    keep the earliest candidate. An empty input returns ``''``.
    """
    word_segm_res = list(word_segm_res)

    def pick_best(candidates):
        best, best_prob = '', -math.inf
        for w_s in candidates:
            prob = calculate_word_sequence_probability(w_s)
            if prob > best_prob:
                best, best_prob = w_s, prob
        return best, best_prob

    compact = [
        w_s for w_s in word_segm_res
        if len(w_s.split(' ')) <= len(''.join(w_s.split())) * 0.6
    ]
    # The fallback is decided once, after filtering, so that the result does
    # not depend on the order of the candidates.
    best_w_s, max_prob = pick_best(compact if compact else word_segm_res)
    print('最好的分詞結果（概率為%s）是 ：%s' % (math.exp(max_prob), best_w_s))
    return best_w_s


def split_middle(sen_to_segment):
    """Return the index of the last character of the left half of a long sentence.

    The five characters around the middle are segmented, and the cut is placed
    right after the first word of the best segmentation.
    """
    length = len(sen_to_segment)
    start = max(length // 2 - 2, 0)
    middle_part = sen_to_segment[start:start + 5]
    best_segm_res = calculate_biggest_prob(segment_sentence(middle_part))
    # Taking the first word also covers a result without any space.
    first_word = best_segm_res.split(' ', 1)[0]
    return start + len(first_word) - 1


def _segment_run(sen_to_segment):
    """Segment one punctuation-free run, halving it recursively while it is long."""
    if len(sen_to_segment) >= LONG_SENTENCE_LEN:
        middle = split_middle(sen_to_segment)
        left_half = sen_to_segment[:middle + 1]
        right_half = sen_to_segment[middle + 1:]
        if left_half and right_half:  # guard against a degenerate split
            return _segment_run(left_half) + ' ' + _segment_run(right_half)
    return calculate_biggest_prob(segment_sentence(sen_to_segment))


def split_mark_and_too_long_sent(sentences, output_path=DEFAULT_OUTPUT_PATH):
    """Segment *sentences* line by line and write the result to *output_path*.

    Each line is cut at every character for which ``str.isalpha()`` is false
    (punctuation, digits, whitespace). The runs in between are segmented and
    the separators are emitted with a space on each side. The text written to
    the file is also printed and returned.
    """
    sen_list = sentences.splitlines()
    print(sen_list)
    pieces = []
    for line in sen_list:
        pending = []  # characters of the run currently being collected
        for single_char in line:
            if single_char.isalpha():
                pending.append(single_char)
                continue
            if pending:
                pieces.append(_segment_run(''.join(pending)) + ' ')
                pending = []
            print(single_char)
            pieces.append(single_char + ' ')
        # Text left over at the end of the line.
        if pending:
            pieces.append(_segment_run(''.join(pending)) + ' ')
        pieces.append('\n')
    out_text = ''.join(pieces)
    with open(output_path, 'w') as file:
        file.write(out_text)
    print(out_text)
    return out_text


if __name__ == '__main__':
    path = 'D:/1佩王的文件/計算語言學基礎/北大(人民日報)語料庫199801.txt'
    init_corpus_lib(path)  # load the corpus
    path = 'E:/study/1.研一的課/計算語言學基礎課件/testset.txt'  # text to segment
    with open(path, 'r', encoding='gbk', errors='ignore') as file:
        sentences = file.read()
    split_mark_and_too_long_sent(sentences)

上一篇：Django之Mode的外鍵自關聯和引用未定義的Model方法

下一篇：python模糊圖片過濾的方法