import numpy as np
import json
import codecs
# Compute the edit distance
def edit_distance(word1, word2):
    """Return the Levenshtein edit distance between two sequences.

    Insertions, deletions and substitutions each cost 1.

    Args:
        word1: First sequence (supports ``len`` and integer indexing).
        word2: Second sequence (supports ``len`` and integer indexing).

    Returns:
        The distance as a NumPy float scalar, because the table is a
        float array created with ``np.zeros``.
    """
    rows = len(word1) + 1
    cols = len(word2) + 1

    # table[r, c] = distance between word1[:r] and word2[:c].
    table = np.zeros((rows, cols))
    # Borders: turning a prefix into the empty sequence costs its length.
    table[:, 0] = np.arange(rows)
    table[0, :] = np.arange(cols)

    for r in range(1, rows):
        left_item = word1[r - 1]
        for c in range(1, cols):
            mismatch = 0 if left_item == word2[c - 1] else 1
            substitution = table[r - 1, c - 1] + mismatch
            deletion = table[r - 1, c] + 1
            insertion = table[r, c - 1] + 1
            table[r, c] = min(substitution, deletion, insertion)

    return table[rows - 1, cols - 1]
# 190801
# Compute a similarity score from the edit distance
def simility(word1, word2):
    """Return a similarity score derived from the edit distance.

    The score is ``1 - edit_distance / max(len(word1), len(word2))``:
    1.0 for identical inputs, 0.0 when the distance equals the longer length.

    Args:
        word1: First sequence.
        word2: Second sequence.

    Returns:
        The similarity as a float. Two empty inputs are identical, so
        they score 1.0.
    """
    max_len = max(len(word1), len(word2))
    if max_len == 0:
        # Both inputs are empty: without this guard the ratio below is 0/0.
        return 1.0
    distance = edit_distance(word1, word2)
    return 1 - distance * 1.0 / max_len
# Greedy one-pass clustering of the input lines by edit-distance similarity.
# Each still-unassigned line seeds a new cluster; every later unassigned line
# whose similarity to that seed reaches SIM_THRESHOLD joins the same cluster.
INPUT_PATH = r'C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\yiwoqu\code\xianbingshi_write_sub.txt'
OUTPUT_PATH = 'leibie05.json'
SIM_THRESHOLD = 0.5  # minimum similarity to the cluster seed for membership

# Never filled: it belonged to a disabled parsing step that split each line
# on '<->' into an id and a sentence and stripped '<b>'/'<e>' characters.
bianhaos = []
sub_sens = []
with codecs.open(INPUT_PATH, 'r', 'utf8') as f:
    # Lines are stored verbatim, including their line terminator.
    # NOTE(review): the terminator therefore takes part in the similarity
    # score (two different one-character lines already score >= 0.5) --
    # confirm whether lines should be stripped before comparison.
    sub_sens.extend(f)

count = len(sub_sens)
leibie = [-1] * count  # cluster id per line; -1 means "not assigned yet"
cla = 0  # id that the next new cluster will receive
print(count)

for i, sub1 in enumerate(sub_sens):
    if leibie[i] != -1:
        continue
    # Line i is still unassigned, so it becomes the seed of cluster `cla`.
    leibie[i] = cla
    # Every index <= i already has a cluster at this point, so only the
    # lines after i can still join.
    for j in range(i + 1, count):
        if leibie[j] == -1 and simility(sub1, sub_sens[j]) >= SIM_THRESHOLD:
            leibie[j] = cla
    cla += 1
    print(i)  # progress: index of the seed that was just processed

print(leibie)
with open(OUTPUT_PATH, 'w') as f:
    json.dump(leibie, f)
# Source watermark: 知识兔 (not part of the program)