【NLP_命名实体识别】Bert/Albert+CRF模型实现
发布日期:2021-06-29 02:15:36
浏览次数:2
分类:技术文章
本文共 15271 字,大约阅读时间需要 50 分钟。
模块调用
2021/3/8 周一:针对模块调用部分(代码如下)的报错,重装 Anaconda 与 TensorFlow 后问题解决。
import numpy as np
from bert4keras.backend import keras, K
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open, ViterbiDecoder
from bert4keras.layers import ConditionalRandomField
from keras.layers import Dense
from keras.models import Model
from tqdm import tqdm
from tensorflow import ConfigProto
from tensorflow import InteractiveSession
# NOTE: in the two imports above, "tensorflow" was originally
# "tensorflow.compat.v1".
报错原因与解决方案:(1)Anaconda 内置的 Python 版本与 TensorFlow 版本不匹配——之前没有注意到 TensorFlow 1.x 不支持 Python 3.7 以上的版本(即 3.8 及更高版本)。重装内置 Python 3.6 的 Anaconda,再安装与之对应版本的 TensorFlow,即可解决。(2)多次安装 TensorFlow-GPU 均以失败告终,原来是我电脑的显卡并不支持 GPU 版本,只能使用 CPU 版本。
小结:有些时候,bug“缠身”,不妨直接卸载重装,可能更节省时间。
2021/3/9 周二:开组会,接受批评,反思自己。
保存、加载并使用已训练的Bert/Albert-CRF模型
2021/3/10:加载并使用训练好的 Bert/Albert-CRF 模型;同时在此基础上增加一层 BiLSTM 网络,得到修改后的 Albert-BiLSTM-CRF 模型(见下一篇文章),并开始训练。
# Training entry point of the training script, disabled here because the
# weights have already been trained and saved:
#
# if __name__ == '__main__':
#     evaluator = Evaluate()
#     train_generator = data_generator(train_data, batch_size)
#     model.fit_generator(
#         train_generator.forfit(),
#         steps_per_epoch=len(train_generator),
#         epochs=epochs,
#         callbacks=[evaluator]
#     )
# else:
#     model.load_weights('best_model.weights')

# Load the saved weights BEFORE building the recognizer: the recognizer takes
# a snapshot of CRF.trans (via K.eval) at construction time.
model.load_weights('best_model.weights')
NER = NamedEntityRecognizer(trans=K.eval(CRF.trans), starts=[0], ends=[0])

# recognize() is an instance method, so it is called on the NER instance.
ner = NER.recognize("我在厦门")
print(ner)
流程:先训练模型,得到训练好的权重文件(best_model.weights);此时也可以同时得到训练好的模型文件。使用模型时,先加载上述权重文件,再把脚本末尾的训练调用改为识别调用即可。
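注意:类的实例化。recognize 是实例方法,必须先实例化 NamedEntityRecognizer(即 NER = NamedEntityRecognizer(...)),再通过实例 NER 调用;并且要在 model.load_weights 之后再实例化,这样 K.eval(CRF.trans) 取到的才是训练好的转移矩阵。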
Bert/Albert-CRF模型完整代码
仅训练与评估模型:
import numpy as np
from bert4keras.backend import keras, K
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open, ViterbiDecoder
from bert4keras.layers import ConditionalRandomField
from keras.layers import Dense
from keras.models import Model
from tqdm import tqdm
from tensorflow import ConfigProto
from tensorflow import InteractiveSession
# NOTE: in the two imports above, "tensorflow" was originally
# "tensorflow.compat.v1".

# Let TensorFlow grow GPU memory on demand instead of reserving it all.
config = ConfigProto()
# config.gpu_options.per_process_gpu_memory_fraction = 0.2
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

maxlen = 256
epochs = 1  # 10
batch_size = 16
bert_layers = 12
learing_rate = 1e-5  # the smaller bert_layers is, the larger this should be
crf_lr_multiplier = 10  # enlarge the CRF layer's learning rate if needed (1000)

# # BERT configuration
# config_path = './bert_model/chinese_L-12_H-768_A-12/bert_config.json'
# checkpoint_path = './bert_model/chinese_L-12_H-768_A-12/bert_model.ckpt'
# dict_path = './bert_model/chinese_L-12_H-768_A-12/vocab.txt'

# ALBERT configuration
config_path = './bert_model/albert_large/albert_config.json'
checkpoint_path = './bert_model/albert_large/model.ckpt-best'
dict_path = './bert_model/albert_large/vocab_chinese.txt'


def load_data(filename):
    """Load a character-level tagged corpus and merge characters into spans.

    The file is read as samples separated by a blank line; every line of a
    sample is ``<char> <tag>`` where the tag is ``O``, ``B-<type>`` or any
    other value (treated as a continuation of the previous span).

    Args:
        filename: path of the UTF-8 encoded corpus file.

    Returns:
        A list of samples; each sample is a list of ``[text, label]`` pairs
        where ``label`` is ``'O'`` or the entity type taken from the ``B-`` tag.

    Raises:
        ValueError: if a non-empty line contains no space separator.
    """
    D = []
    with open(filename, encoding='utf-8') as f:
        content = f.read()
    for sample in content.split('\n\n'):
        if not sample:
            continue
        d, last_flag = [], ''
        for line in sample.split('\n'):
            if not line:
                # e.g. the trailing newline at the end of the file
                continue
            # Split on the LAST space so a line whose character is itself a
            # space still unpacks into exactly (char, tag).
            char, this_flag = line.rsplit(' ', 1)
            if this_flag == 'O' and last_flag == 'O':
                d[-1][0] += char
            elif this_flag == 'O' and last_flag != 'O':
                d.append([char, 'O'])
            elif this_flag[:1] == 'B':
                d.append([char, this_flag[2:]])
            else:
                d[-1][0] += char
            last_flag = this_flag
        if d:
            D.append(d)
    return D


# Annotated data
train_data = load_data('./data/example.train')
valid_data = load_data('./data/example.dev')
test_data = load_data('./data/example.test')

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Label mapping: id 0 is "O"; entity type k uses 2k+1 for B and 2k+2 for I.
labels = ['PER', 'LOC', 'ORG']
id2label = dict(enumerate(labels))
label2id = {j: i for i, j in id2label.items()}
num_labels = len(labels) * 2 + 1


class data_generator(DataGenerator):
    """Data generator yielding padded ``([token_ids, segment_ids], labels)``."""

    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, item in self.sample(random):
            # The start token and (below) the end token are labelled 0 ("O").
            token_ids, labels = [tokenizer._token_start_id], [0]
            for w, l in item:
                # Drop the start/end tokens that encode() adds around the span.
                w_token_ids = tokenizer.encode(w)[0][1:-1]
                if not w_token_ids:
                    # A span that encodes to nothing must not emit a B label,
                    # otherwise labels and token_ids fall out of alignment.
                    continue
                if len(token_ids) + len(w_token_ids) < maxlen:
                    token_ids += w_token_ids
                    if l == 'O':
                        labels += [0] * len(w_token_ids)
                    else:
                        B = label2id[l] * 2 + 1
                        I = label2id[l] * 2 + 2
                        labels += ([B] + [I] * (len(w_token_ids) - 1))
                else:
                    # Truncate the sample once maxlen would be exceeded.
                    break
            token_ids += [tokenizer._token_end_id]
            labels += [0]
            segment_ids = [0] * len(token_ids)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append(labels)
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []


# The ALBERT variant is active below. To use a BERT checkpoint instead, switch
# the paths above and swap these lines with the commented-out block.
model = build_transformer_model(
    config_path,
    checkpoint_path,
    model='albert',
)
output_layer = 'Transformer-FeedForward-Norm'
output = model.get_layer(output_layer).get_output_at(bert_layers - 1)

# model = build_transformer_model(
#     config_path,
#     checkpoint_path,
# )
#
# output_layer = 'Transformer-%s-FeedForward-Norm' % (bert_layers - 1)
# output = model.get_layer(output_layer).output

output = Dense(num_labels)(output)
CRF = ConditionalRandomField(lr_multiplier=crf_lr_multiplier)
output = CRF(output)

model = Model(model.input, output)
model.summary()

model.compile(
    loss=CRF.sparse_loss,
    optimizer=Adam(learing_rate),
    metrics=[CRF.sparse_accuracy]
)


class NamedEntityRecognizer(ViterbiDecoder):
    """Named entity recognizer built on the module-level model and tokenizer."""

    def recognize(self, text):
        """Return the entities found in ``text``.

        Args:
            text: the raw input string.

        Returns:
            A list of ``(entity_text, entity_type)`` tuples.
        """
        tokens = tokenizer.tokenize(text)
        # Shorten over-long input by dropping tokens just before the last one.
        while len(tokens) > 512:
            tokens.pop(-2)
        mapping = tokenizer.rematch(text, tokens)
        token_ids = tokenizer.tokens_to_ids(tokens)
        segment_ids = [0] * len(token_ids)
        nodes = model.predict([[token_ids], [segment_ids]])[0]
        labels = self.decode(nodes)
        entities, starting = [], False
        for i, label in enumerate(labels):
            if label > 0:
                if label % 2 == 1:
                    # Odd id: B tag, open a new entity.
                    starting = True
                    entities.append([[i], id2label[(label - 1) // 2]])
                elif starting:
                    # Even id after a B tag: extend the current entity.
                    entities[-1][0].append(i)
                else:
                    starting = False
            else:
                starting = False

        # Map token positions back to character offsets in the original text.
        return [(text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1], l)
                for w, l in entities]


NER = NamedEntityRecognizer(trans=K.eval(CRF.trans), starts=[0], ends=[0])


def evaluate(data):
    """Compute entity-level scores of ``NER`` on ``data``.

    Args:
        data: samples in the format returned by ``load_data``.

    Returns:
        A ``(f1, precision, recall)`` tuple of floats.
    """
    # Tiny initial values avoid division by zero when nothing is predicted.
    X, Y, Z = 1e-10, 1e-10, 1e-10
    for d in tqdm(data):
        text = ''.join([i[0] for i in d])
        R = set(NER.recognize(text))
        T = {tuple(i) for i in d if i[1] != 'O'}
        X += len(R & T)
        Y += len(R)
        Z += len(T)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    return f1, precision, recall


class Evaluate(keras.callbacks.Callback):
    """Callback that evaluates after each epoch and keeps the best weights."""

    def __init__(self):
        super(Evaluate, self).__init__()
        self.best_val_f1 = 0

    def on_epoch_end(self, epoch, logs=None):
        # Refresh the decoder's transition matrix from the CRF layer.
        trans = K.eval(CRF.trans)
        NER.trans = trans
        print(NER.trans)
        f1, precision, recall = evaluate(valid_data)
        # Save the best weights
        if f1 >= self.best_val_f1:
            self.best_val_f1 = f1
            model.save_weights('best_model.weights')
        print(
            'valid: f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' %
            (f1, precision, recall, self.best_val_f1)
        )
        f1, precision, recall = evaluate(test_data)
        print(
            'test: f1: %.5f, precision: %.5f, recall: %.5f\n' %
            (f1, precision, recall)
        )


if __name__ == '__main__':

    evaluator = Evaluate()
    train_generator = data_generator(train_data, batch_size)

    model.fit_generator(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        epochs=epochs,
        callbacks=[evaluator]
    )

else:

    model.load_weights('best_model.weights')
使用训练好的模型,简例:
import numpy as np
from bert4keras.backend import keras, K
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open, ViterbiDecoder
from bert4keras.layers import ConditionalRandomField
from keras.layers import Dense
from keras.models import Model
from tqdm import tqdm
from tensorflow import ConfigProto
from tensorflow import InteractiveSession
# NOTE: in the two imports above, "tensorflow" was originally
# "tensorflow.compat.v1".

# Let TensorFlow grow GPU memory on demand instead of reserving it all.
config = ConfigProto()
# config.gpu_options.per_process_gpu_memory_fraction = 0.2
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

maxlen = 256
epochs = 1  # 10
batch_size = 16
bert_layers = 12
learing_rate = 1e-5  # the smaller bert_layers is, the larger this should be
crf_lr_multiplier = 10  # enlarge the CRF layer's learning rate if needed (1000)

# # BERT configuration
# config_path = './bert_model/chinese_L-12_H-768_A-12/bert_config.json'
# checkpoint_path = './bert_model/chinese_L-12_H-768_A-12/bert_model.ckpt'
# dict_path = './bert_model/chinese_L-12_H-768_A-12/vocab.txt'

# ALBERT configuration
config_path = './bert_model/albert_large/albert_config.json'
checkpoint_path = './bert_model/albert_large/model.ckpt-best'
dict_path = './bert_model/albert_large/vocab_chinese.txt'


def load_data(filename):
    """Load a character-level tagged corpus and merge characters into spans.

    The file is read as samples separated by a blank line; every line of a
    sample is ``<char> <tag>`` where the tag is ``O``, ``B-<type>`` or any
    other value (treated as a continuation of the previous span).

    Args:
        filename: path of the UTF-8 encoded corpus file.

    Returns:
        A list of samples; each sample is a list of ``[text, label]`` pairs
        where ``label`` is ``'O'`` or the entity type taken from the ``B-`` tag.

    Raises:
        ValueError: if a non-empty line contains no space separator.
    """
    D = []
    with open(filename, encoding='utf-8') as f:
        content = f.read()
    for sample in content.split('\n\n'):
        if not sample:
            continue
        d, last_flag = [], ''
        for line in sample.split('\n'):
            if not line:
                # e.g. the trailing newline at the end of the file
                continue
            # Split on the LAST space so a line whose character is itself a
            # space still unpacks into exactly (char, tag).
            char, this_flag = line.rsplit(' ', 1)
            if this_flag == 'O' and last_flag == 'O':
                d[-1][0] += char
            elif this_flag == 'O' and last_flag != 'O':
                d.append([char, 'O'])
            elif this_flag[:1] == 'B':
                d.append([char, this_flag[2:]])
            else:
                d[-1][0] += char
            last_flag = this_flag
        if d:
            D.append(d)
    return D


# Annotated data
train_data = load_data('./data/example.train')
valid_data = load_data('./data/example.dev')
test_data = load_data('./data/example.test')

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Label mapping: id 0 is "O"; entity type k uses 2k+1 for B and 2k+2 for I.
labels = ['PER', 'LOC', 'ORG']
id2label = dict(enumerate(labels))
label2id = {j: i for i, j in id2label.items()}
num_labels = len(labels) * 2 + 1


class data_generator(DataGenerator):
    """Data generator yielding padded ``([token_ids, segment_ids], labels)``."""

    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, item in self.sample(random):
            # The start token and (below) the end token are labelled 0 ("O").
            token_ids, labels = [tokenizer._token_start_id], [0]
            for w, l in item:
                # Drop the start/end tokens that encode() adds around the span.
                w_token_ids = tokenizer.encode(w)[0][1:-1]
                if not w_token_ids:
                    # A span that encodes to nothing must not emit a B label,
                    # otherwise labels and token_ids fall out of alignment.
                    continue
                if len(token_ids) + len(w_token_ids) < maxlen:
                    token_ids += w_token_ids
                    if l == 'O':
                        labels += [0] * len(w_token_ids)
                    else:
                        B = label2id[l] * 2 + 1
                        I = label2id[l] * 2 + 2
                        labels += ([B] + [I] * (len(w_token_ids) - 1))
                else:
                    # Truncate the sample once maxlen would be exceeded.
                    break
            token_ids += [tokenizer._token_end_id]
            labels += [0]
            segment_ids = [0] * len(token_ids)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append(labels)
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []


# The ALBERT variant is active below. To use a BERT checkpoint instead, switch
# the paths above and swap these lines with the commented-out block.
model = build_transformer_model(
    config_path,
    checkpoint_path,
    model='albert',
)
output_layer = 'Transformer-FeedForward-Norm'
output = model.get_layer(output_layer).get_output_at(bert_layers - 1)

# model = build_transformer_model(
#     config_path,
#     checkpoint_path,
# )
#
# output_layer = 'Transformer-%s-FeedForward-Norm' % (bert_layers - 1)
# output = model.get_layer(output_layer).output

output = Dense(num_labels)(output)
CRF = ConditionalRandomField(lr_multiplier=crf_lr_multiplier)
output = CRF(output)

model = Model(model.input, output)
model.summary()

model.compile(
    loss=CRF.sparse_loss,
    optimizer=Adam(learing_rate),
    metrics=[CRF.sparse_accuracy]
)


class NamedEntityRecognizer(ViterbiDecoder):
    """Named entity recognizer built on the module-level model and tokenizer."""

    def recognize(self, text):
        """Return the entities found in ``text``.

        Args:
            text: the raw input string.

        Returns:
            A list of ``(entity_text, entity_type)`` tuples.
        """
        tokens = tokenizer.tokenize(text)
        # Shorten over-long input by dropping tokens just before the last one.
        while len(tokens) > 512:
            tokens.pop(-2)
        mapping = tokenizer.rematch(text, tokens)
        token_ids = tokenizer.tokens_to_ids(tokens)
        segment_ids = [0] * len(token_ids)
        nodes = model.predict([[token_ids], [segment_ids]])[0]
        labels = self.decode(nodes)
        entities, starting = [], False
        for i, label in enumerate(labels):
            if label > 0:
                if label % 2 == 1:
                    # Odd id: B tag, open a new entity.
                    starting = True
                    entities.append([[i], id2label[(label - 1) // 2]])
                elif starting:
                    # Even id after a B tag: extend the current entity.
                    entities[-1][0].append(i)
                else:
                    starting = False
            else:
                starting = False

        # Map token positions back to character offsets in the original text.
        return [(text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1], l)
                for w, l in entities]


def evaluate(data):
    """Compute entity-level scores of the module-level ``NER`` on ``data``.

    ``NER`` is created at the bottom of this script, so this function may only
    be called after the script has finished loading.

    Args:
        data: samples in the format returned by ``load_data``.

    Returns:
        A ``(f1, precision, recall)`` tuple of floats.
    """
    # Tiny initial values avoid division by zero when nothing is predicted.
    X, Y, Z = 1e-10, 1e-10, 1e-10
    for d in tqdm(data):
        text = ''.join([i[0] for i in d])
        R = set(NER.recognize(text))
        T = {tuple(i) for i in d if i[1] != 'O'}
        X += len(R & T)
        Y += len(R)
        Z += len(T)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    return f1, precision, recall


class Evaluate(keras.callbacks.Callback):
    """Callback that evaluates after each epoch and keeps the best weights."""

    def __init__(self):
        super(Evaluate, self).__init__()
        self.best_val_f1 = 0

    def on_epoch_end(self, epoch, logs=None):
        # Refresh the decoder's transition matrix from the CRF layer.
        trans = K.eval(CRF.trans)
        NER.trans = trans
        print(NER.trans)
        f1, precision, recall = evaluate(valid_data)
        # Save the best weights
        if f1 >= self.best_val_f1:
            self.best_val_f1 = f1
            model.save_weights('best_model.weights')
        print(
            'valid: f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' %
            (f1, precision, recall, self.best_val_f1)
        )
        f1, precision, recall = evaluate(test_data)
        print(
            'test: f1: %.5f, precision: %.5f, recall: %.5f\n' %
            (f1, precision, recall)
        )


# Load the saved weights BEFORE building the recognizer: the recognizer takes
# a snapshot of CRF.trans (via K.eval) at construction time.
model.load_weights('best_model.weights')
NER = NamedEntityRecognizer(trans=K.eval(CRF.trans), starts=[0], ends=[0])

# recognize() is an instance method, so it is called on the NER instance.
ner = NER.recognize("我在厦门")
print(ner)
小总结:多尝试,改改改 + 基础编程知识(如:类的使用)要扎实。
转载地址:https://blog.csdn.net/YWP_2016/article/details/114543741 如侵犯您的版权,请留言回复原文章的地址,我们会给您删除此文章,给您带来不便请您谅解!
发表评论
最新留言
不错!
[***.144.177.141]2024年04月18日 13时17分56秒
关于作者
喝酒易醉,品茶养心,人生如梦,品茶悟道,何以解忧?唯有杜康!
-- 愿君每日到此一游!
推荐文章
如何使用 kubeadm 安装 Kubernetes?
2019-04-29
开源技术、开放使用、业务导向的大数据平台,助力银行数字化转型
2019-04-29
【Camera专题】Sprd-深入浅出Camera驱动框架1(HAL层-Kernel层)
2019-04-29
c++ 类写法风格
2019-04-29
c++ 继承 关系
2019-04-29
c++ 派生类的构造函数要点
2019-04-29
虚函数表 图解
2019-04-29
为什么需要虚析构函数
2019-04-29
析构函数是否必须为虚函数?为何?
2019-04-29
c++ 虚基类
2019-04-29
c++ 强制类型转换
2019-04-29
三原色还原
2019-04-29
C语言typeof详解
2019-04-29
文章信息显示
2019-04-29
默认的
2019-04-29
图片方式
2019-04-29
213213213
2019-04-29
Operation not allowed
2019-04-29
java设计模式--迪米特法则
2019-04-29