admin 管理员组

文章数量: 1087139

recode

知识图谱-命名实体-关系-免费标注工具-快速打标签-Python3 ()

以下为文章中 recode_2.py 文件的更新代码:

# -*- coding: utf-8 -*-
"""
Updated 2023/5/18
CHEN | YNU
"""
# ----- Only the file path and the number below need to be edited ↓↓↓↓↓↓↓↓
# Path of the annotation (.anns) file to convert.
filea = r'C:\Users\DELL\Desktop\测试数据\原始文本-noneuser-03-14-17-03-50_分段_03-14-17-09-27.anns'

# Export mode selector:
#   keytime = 1 : export the raw 5-tuples
#   keytime = 2 : export the one-to-one (per-character) labels
#   keytime = 3 : export the 7-tuples
#   keytime = 4 : export the JSON-format file
keytime = 1
# ----- Only the file path and the number above need to be edited ↑↑↑↑↑↑↑↑

import re
import re
from datetime import datetime

# Single-letter codes found in the annotation file and the Chinese display
# names they are translated to on export (same position = same label).
# An earlier label set, kept for reference:
#   zh = ['病名','病症','其它','药名','诊断方案','治疗方案', "取消标注",'包含','治疗',
#         '危险因素','辅助诊断','特征','并发','别名','作用','条件','诊断']
#   ti = ["A","B","C","D","E","F","Q","I","T","K","U","M","N","L","J","Y","G"]
_TAG_CODES = ["A", "B", "C", "D", "E", "F", "Q", "I", "T", "K",
              "U", "M", "N", "L", "J", "Y", "G", "Z", "X", "W"]
_TAG_NAMES_ZH = ['指标', '程度', '动作', '场设线', '故障影响', '调控中心', "取消标注",
                 '对应指标', '对应程度', '对应动作', '对应编号', '故障影响位置',
                 '具体影响为', '对应调控中心', '控制目标', '稳定要求限制目标',
                 '具体要求为', "故障编号", "预想故障", "稳定要求"]
_CODE_TO_ZH = dict(zip(_TAG_CODES, _TAG_NAMES_ZH))


def _relation_rows(entuty_list):
    """Expand annotated rows into one ``[n, text, type, a, b]`` row per relation.

    Rows containing the element ``"O"`` or having at most two elements are
    skipped.  Every remaining row bumps the running counter ``n``.  A
    4-element row yields one output row; a longer row yields one output row
    for every further pair of elements after the first two.  The input rows
    are copied, never modified, so the same parsed data can be processed
    more than once.
    """
    rows = []
    entity_no = 0
    for entii in entuty_list:
        if "O" in entii or len(entii) <= 2:
            continue
        entity_no += 1  # 3-element rows are counted too, but emit nothing
        if len(entii) == 4:  # exactly one relation
            rows.append([entity_no, *entii])
        elif len(entii) > 4:  # several relations on the same entity
            numbered = [entity_no, *entii]
            # presumably each trailing pair is (pair id, relation+role)
            for start in range(3, len(numbered), 2):
                rows.append(numbered[:3] + numbered[start:start + 2])
    return rows


def sanyuanzu1(entuty_list):
    """Build the relation 5-tuples, translate their codes and export them.

    :param entuty_list: parsed annotation rows as returned by ``readfile``
    :return: the translated 5-tuples (also written to a CSV file by
        ``wordtoch``, which needs the module-level ``file_name``)
    """
    return wordtoch(sanyuanzu2(_relation_rows(entuty_list)))


def sanyuanzu1ooo(entuty_list):
    """Same as ``sanyuanzu1`` but without writing any file.

    :param entuty_list: parsed annotation rows as returned by ``readfile``
    :return: the translated 5-tuples
    """
    return wordtochooo(sanyuanzu2(_relation_rows(entuty_list)))


def sanyuanzu2(entity_lists):
    """Pair up rows that share the same value at index 3.

    For each row the nearest later row with an equal index-3 value is taken
    as its partner.  The row whose last element contains ``"1"`` comes
    first in the result: ``[text, type, relation, other_type, other_text]``.

    :param entity_lists: rows of the form ``[n, text, type, pair, relation]``
    :return: list of 5-element lists
    """
    entitysan_list = []
    for index, relation in enumerate(entity_lists):
        for relation2 in entity_lists[index + 1:]:
            if relation[3] != relation2[3]:
                continue
            if "1" in relation[-1]:
                head, tail = relation, relation2
            else:
                head, tail = relation2, relation
            entitysan_list.append(
                [head[1], head[2], head[-1], tail[2], tail[1]])
            # Stop at the nearest partner so that labelling can be done in
            # several separate passes without cross-matching older pairs.
            break
    return entitysan_list


def wordtoch(words):
    """Translate the codes of every 5-tuple and write them to a CSV file.

    The output name is built from the module-level ``file_name`` (its last
    five characters, i.e. ``.anns``, are dropped) plus ``_五元组_`` and a
    timestamp.

    :param words: list of 5-element lists, translated in place
    :return: ``words``
    """
    wordtochooo(words)
    now_time = datetime.now().strftime('%m-%d-%H-%M-%S')
    new_filename = file_name[:-5] + '_五元组_' + now_time + '.csv'
    with open(new_filename, 'w', encoding="utf-8") as filew:
        for word in words:
            filew.write(str(word).strip('[').strip(']') + '\n')
    print('5元组文件已导出!')
    return words


def wordtochooo(words):
    """Translate the codes at positions 1, 2 and 3 of every 5-tuple in place.

    Position 2 is looked up by its first character only.  A tuple whose
    code cannot be looked up is left as it is at the point of failure and
    processing continues with the next tuple.

    :param words: list of 5-element lists
    :return: ``words``
    """
    for word in words:
        try:
            word[1] = _CODE_TO_ZH[word[1]]
            word[2] = _CODE_TO_ZH[word[2][0]]
            word[3] = _CODE_TO_ZH[word[3]]
        except (KeyError, IndexError, TypeError):
            continue
    return words


def readfile(file):
    """Read an annotation file and split each line on space, ``@`` and ``_``.

    :param file: path of a UTF-8 text file
    :return: one list of fields per line (an empty line gives ``['']``)
    """
    with open(file, "r", encoding='utf-8') as handle:
        return [re.split(" |@|_", line.strip('\n')) for line in handle]


# Tagging
# Single-letter codes found in the annotation file and the English
# abbreviations used when building per-character labels.
_LABEL_CODES = ["A", "B", "C", "D", "E", "F", "Q", "I", "T", "K",
                "U", "M", "N", "L", "J", "Y", "G", "Z", "X", "W"]
_LABEL_NAMES_EN = ['dis', 'hyp', 'oth', 'med', 'dia', 'cur', "none",
                   'Incl', 'Trea', 'Risk', 'Auxi', 'Char', 'Conc', 'Alia',
                   'Acti', 'Cond', 'Diag', "zzz", "xxx", "www"]
_CODE_TO_EN = dict(zip(_LABEL_CODES, _LABEL_NAMES_EN))


def tag_entity(word_list, label, schema='BIEO'):
    """Tag every character of an entity with a position-prefixed label.

    :param word_list: the entity split into a list of single characters
    :param label: label of the entity
    :param schema: ``'BIEO'`` marks the last character with ``E-``; any
        other value marks it with ``I-``
    :return: a list of ``"<char> <prefix><label>\\n"`` strings; for a
        one-character entity a single such string (not a list) is returned
    """
    list_len = len(word_list)
    if list_len == 1:
        # A lone character is always tagged B-, whatever the schema.
        return word_list[0] + ' ' + 'B-' + label + '\n'

    last_prefix = 'E-' if schema == 'BIEO' else 'I-'
    output_list = []
    for idx, char in enumerate(word_list):
        if idx == 0:
            prefix = 'B-'
        elif idx == list_len - 1:
            prefix = last_prefix
        else:
            prefix = 'I-'
        output_list.append(char + ' ' + prefix + label + '\n')
    return output_list


def biaoqian(file_list, target=None):
    """Turn parsed annotation rows into per-character label lines.

    Rows containing ``"O"`` (or having exactly two fields) are tagged ``O``
    character by character.  A 4-field row is tagged with the English
    abbreviation of the first character of its last field, a dash and the
    last character of that field.  A row with more than four fields is
    tagged ``main-1``.  All other rows are ignored.

    :param file_list: parsed annotation rows as returned by ``readfile``
    :param target: list to append to; when omitted the module-level
        ``ms_list`` is used, as before
    :return: the list that was appended to
    :raises KeyError: if a 4-field row carries an unknown label code
    """
    if target is None:
        target = ms_list
    for entii in file_list:
        if "O" in entii or len(entii) == 2:
            target.append([k + ' ' + 'O\n' for k in entii[0]])
        elif len(entii) == 4:
            label = _CODE_TO_EN[entii[-1][0]] + '-' + entii[-1][-1]
            target.append(tag_entity(list(entii[0]), label))
        elif len(entii) > 4:
            target.append(tag_entity(list(entii[0]), "main-1"))
    return target


def writefile(ms_list):
    """Write the label lines to a new ``.anns`` file.

    The output name is built from the module-level ``file_name`` (its last
    five characters are dropped) plus ``_一对一_`` and a timestamp.  An
    extra newline is written after any piece containing ``。``.

    :param ms_list: iterable of iterables of strings, as filled by
        ``biaoqian``
    """
    now_time = datetime.now().strftime('%m-%d-%H-%M-%S')
    new_filename = file_name[:-5] + '_一对一_' + now_time + '.anns'
    with open(new_filename, 'w', encoding='utf-8') as f:
        for group in ms_list:
            for piece in group:
                f.write(piece + '\n' if '。' in piece else piece)
    print("已经输出ann文件!")


def Soooooda(entuty_list):
    """Split the parsed rows into sentences and extract tuples per sentence.

    A row consisting of a single empty field ends the current sentence; the
    last row of the input also ends it.  The first field of every collected
    row is concatenated to form the sentence text, and the collected rows
    are handed to ``sanyuanzu1ooo``.

    :param entuty_list: parsed annotation rows as returned by ``readfile``
    :return: list of ``[sentence_id, sentence_text, tuples]``
    """
    result_list = []
    juzilist = []   # rows belonging to the current sentence
    juzi = ''       # text of the current sentence
    juziid = 0      # running sentence id, keeps identical sentences apart
    last_idx = len(entuty_list) - 1

    for idx, entii in enumerate(entuty_list):
        if len(entii) == 1 and len(entii[0]) == 0:
            juziid += 1
            result_list.append([juziid, juzi, sanyuanzu1ooo(juzilist)])
            juzi = ''
            juzilist = []
        elif len(entii) > 1 and idx < last_idx:
            juzi += entii[0]
            juzilist.append(entii)
        elif idx == last_idx:
            juziid += 1
            juzi += entii[0]
            juzilist.append(entii)
            result_list.append([juziid, juzi, sanyuanzu1ooo(juzilist)])
    return result_list


def write_Soooooda(result_list):
    """Write one CSV line per extracted tuple: id, sentence text, tuple.

    The output name is built from the module-level ``file_name`` (its last
    five characters are dropped) plus ``_7元组_`` and a timestamp.
    Sentences without tuples are skipped.

    :param result_list: list of ``[sentence_id, sentence_text, tuples]``
    """
    now_time = datetime.now().strftime('%m-%d-%H-%M-%S')
    new_filename = file_name[:-5] + '_7元组_' + now_time + '.csv'
    with open(new_filename, 'w', encoding='utf-8') as f:
        for i_list in result_list:
            # ASCII commas inside the sentence would be read as column
            # separators, so they are swapped for full-width commas.
            sentence = i_list[1].replace(",", "\uff0c")
            for wordlist in i_list[-1]:
                f.write(str(i_list[0]) + "," + sentence + ","
                        + str(wordlist).strip('[').strip(']') + '\n')
    print("已输出7元组文件")
    return


# '''
# s = [
#     {"text": "查尔斯·阿兰基斯(Charles Aránguiz),1989年4月17日出生于智利圣地亚哥,智利职业足球运动员,司职中场,效力于德国足球甲级联赛勒沃库森足球俱乐部",
#         "new_spo_list": [
#             {"s": {"entity": "查尔斯·阿兰基斯","type": "people"},
#             "p": {"entity": "出生地","type": "_rel"},
#             "o": {"entity": "圣地亚哥","type": "property"}},
#             {"s": {"entity": "查尔斯·阿兰基斯","type": "people"},
#             "p": {"entity": "出生地","type": "_rel"},
#             "o": {"entity": "圣地亚哥","type": "property"}}
#         ]
#     },
#     {"text": "查尔斯1·阿兰基斯(Charles Aránguiz),1989年4月17日出生于智利圣地亚哥,智利职业足球运动员,司职中场,效力于德国足球甲级联赛勒沃库森足球俱乐部",
#         "new_spo_list": [...]
#     }
# ]
# '''
import json


def spo(alis):
    """Convert one 5-tuple into a subject / predicate / object dict.

    :param alis: ``[subject, subject_type, relation, object_type, object]``,
        e.g. ``['保质期长', '故障编号', '故障影响位置', '稳定要求', '经济价值']``
    :return: ``{"s": {"entity", "type"}, "p": {"entity", "type": "_rel"},
        "o": {"entity", "type"}}``
    """
    return {
        "s": {"entity": alis[0], "type": alis[1]},
        "p": {"entity": alis[2], "type": "_rel"},
        "o": {"entity": alis[4], "type": alis[3]},
    }


def new_spo_list(i_lista):
    """Convert a list of 5-tuples into a list of ``spo`` dicts.

    Example input::

        [['云南牛干巴', '指标', '具体要求为', '程度', '云南省回族人民'],
         ['传统干腌肉制品', '动作', '对应指标', '指标', '牛干巴']]

    :param i_lista: list of 5-element lists
    :return: list of dicts as produced by ``spo``, in the same order
    """
    return [spo(ij) for ij in i_lista]


def Soooooda_json(Soooooda_list):
    """Serialise the sentences that have at least one tuple and save them.

    The text is written to a ``.txt`` file whose name is built from the
    module-level ``file_name`` (its last five characters are dropped) plus
    ``_json_`` and a timestamp.  ``json.dumps`` is used so that quotes and
    backslashes inside the sentence text are escaped and the output stays
    valid JSON.

    :param Soooooda_list: list of ``[sentence_id, sentence_text, tuples]``
    :return: the JSON text that was written
    """
    list_wai = [
        {"text": i_lista[1], "new_spo_list": new_spo_list(i_lista[-1])}
        for i_lista in Soooooda_list
        if len(i_lista[-1]) >= 1
    ]
    now_time = datetime.now().strftime('%m-%d-%H-%M-%S')
    new_filename = file_name[:-5] + '_json_' + now_time + '.txt'
    wri = json.dumps(list_wai, ensure_ascii=False)
    with open(new_filename, 'w', encoding='utf-8') as f:
        f.write(wri)
    print("json-txt写入完成")
    return wri


if __name__ == "__main__":
    schema = "BIEO"        # not read anywhere in this script
    rep = r'\[<.*?\⊙'      # not read anywhere in this script
    file_name = filea      # read as a global by the export functions
    ms_list = []           # filled by biaoqian, consumed by writefile
    file_list = readfile(file_name)
    file_wlist = readfile(file_name)
    Soooooda_list = readfile(file_name)

    if keytime == 1:
        sanyuanzu1(file_list)  # export the 5-tuple file
    elif keytime == 2:
        biaoqian(file_wlist)   # build the label lines
        writefile(ms_list)
    elif keytime == 3:
        write_Soooooda(Soooooda(Soooooda_list))
    elif keytime == 4:
        Soooooda_json(Soooooda(Soooooda_list))
    else:
        print(f"请修正keytime={keytime}为1、2、3 或 4, 并重新运行程序")

本文标签: recode