[AI Fun] Turning AI into an "Entity Detective": The NER Journey from Chatbots to Medical Experts (5-19, Hands-On Edition)
Hi, fellow AI explorers!
Remember chatting with Siri or Xiao Ai? When you ask "What's the weather in Beijing tomorrow?", the assistant not only has to understand that you want a weather forecast (intent classification), it also has to pin down the two key pieces of information: "tomorrow" and "Beijing". That is the job of today's protagonist: Named Entity Recognition (NER)!
🤖 Starting with Chatbots: How AI "Understands"
How a Smart Assistant Works
![Chatbot NER pipeline](https://cdn.learnku.com/uploads/images/202510/26/46135/cdoJ2CVFdq.png!large)
Figure: how a chatbot uses NER to understand user intent and extract parameters
```python
# The full pipeline for one user utterance
user_input = "What's the weather in Beijing tomorrow?"

# Step 1: intent classification
intent = "weather_query"  # a classifier tells us what the user wants

# Step 2: entity recognition (NER's job)
ner_result = {
    "tomorrow": "DATE",    # time entity
    "Beijing": "LOCATION"  # location entity
}

# Step 3: fill the parameter slots
weather_query = {
    "action": "get_weather",
    "date": "tomorrow",
    "location": "Beijing"
}
# Now the AI knows: the user wants tomorrow's weather in Beijing!
```
Without NER, the AI is like a rambling friend: it gets the gist but misses the specifics!
Where NER Is Used
```python
class NERApplications:
    def __init__(self):
        self.scenarios = {
            "customer_service": {
                "input": "I want to cancel my March 15 flight to Shanghai",
                "entities": {"March 15": "DATE", "Shanghai": "LOCATION", "flight": "PRODUCT"}
            },
            "smart_search": {
                "input": "the latest speech by Apple CEO Tim Cook",
                "entities": {"Apple": "ORG", "Tim Cook": "PERSON"}
            },
            "medical_analysis": {
                "input": "Patient developed stomach pain after taking aspirin",
                "entities": {"aspirin": "DRUG", "stomach pain": "SYMPTOM"}
            }
        }

    def demo_extraction(self, domain):
        """Show a canned entity-extraction example for the given domain."""
        return self.scenarios.get(domain, "domain not supported")

# Usage
ner_demo = NERApplications()
medical_case = ner_demo.demo_extraction("medical_analysis")
print("Medical NER example:", medical_case)
```
🏷️ BIO Tagging: Giving Every Word an "ID Badge"
What Is BIO Tagging?
Remember playing role-play games as a kid, where everyone wore a name tag? BIO tagging gives every token in a text its own "ID badge"!
```python
# Example: a medical paper title
title = "Tricuspid valve regurgitation and lithium carbonate toxicity in a newborn infant."

# BIO tagging is like handing every token a "work badge"
bio_explanation = {
    "B-": "Begin - first token of an entity",
    "I-": "Inside - continuation token of an entity",
    "O":  "Outside - not part of any entity"
}

# Tagged result
words_and_tags = [
    ("Tricuspid", "B-DIS"),      # disease begins
    ("valve", "I-DIS"),          # disease continues
    ("regurgitation", "I-DIS"),  # disease continues
    ("and", "O"),                # bystander
    ("lithium", "B-CHEM"),       # chemical begins
    ("carbonate", "I-CHEM"),     # chemical continues
    ("toxicity", "B-DIS"),       # a new disease begins
    ("in", "O"),                 # bystander
    ("a", "O"),                  # bystander
    ("newborn", "O"),            # bystander
    ("infant", "O"),             # bystander
    (".", "O")                   # bystander
]

def extract_entities_from_bio(tagged_words):
    """Extract entities from a BIO-tagged token sequence."""
    entities = []
    current_entity = None
    for word, tag in tagged_words:
        if tag.startswith('B-'):
            # a new entity starts
            if current_entity:
                entities.append(current_entity)
            current_entity = {
                'text': word,
                'type': tag[2:]  # strip the "B-" prefix
            }
        elif tag.startswith('I-') and current_entity:
            # the current entity continues
            current_entity['text'] += ' ' + word
        else:
            # the current entity (if any) has ended
            if current_entity:
                entities.append(current_entity)
            current_entity = None
    # flush the last entity
    if current_entity:
        entities.append(current_entity)
    return entities

def visualize_bio_tagging():
    """Print tokens, tags, and the extracted entities."""
    print("📝 BIO tagging, visualized:")
    print("tokens:", " ".join(f"{word:>12}" for word, _ in words_and_tags))
    print("tags:  ", " ".join(f"{tag:>12}" for _, tag in words_and_tags))
    entities = extract_entities_from_bio(words_and_tags)
    print("\n🎯 Extracted entities:")
    for entity in entities:
        print(f"  - {entity['text']} → {entity['type']}")

# Demo
visualize_bio_tagging()
```
Why Do We Need Both B and I?

```python
# The problem without the B/I distinction
confusing_example = "Apple Inc and Apple iPhone"
bad_tagging  = ["ORG", "ORG", "O", "PRODUCT", "PRODUCT"]
# Problem: you can't tell which tokens belong to the same entity!

# The fix with the B/I distinction
good_tagging = ["B-ORG", "I-ORG", "O", "B-PRODUCT", "I-PRODUCT"]
# Clear: two separate entities!
```
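The distinction matters most when two entities of the same type sit directly next to each other. Here is a minimal sketch (the name-list sentence and its tags are made up for illustration) showing that a plain IO scheme merges adjacent entities, while BIO keeps them apart; it reuses the `extract_entities_from_bio` helper defined above:

```python
# Hypothetical case: two PERSON entities that are adjacent after tokenization,
# e.g. the name list "Alice Smith Bob Jones" (made-up data)
tokens = ["Alice", "Smith", "Bob", "Jones"]

io_tags  = ["PER", "PER", "PER", "PER"]          # IO scheme: one blob, boundary lost
bio_tags = ["B-PER", "I-PER", "B-PER", "I-PER"]  # BIO scheme: B- marks each boundary

# Decoding IO: every maximal run of the same tag becomes one entity
# -> ["Alice Smith Bob Jones"]  (wrong: two people merged into one)

# Decoding BIO with the helper defined earlier:
entities = extract_entities_from_bio(list(zip(tokens, bio_tags)))
print([e['text'] for e in entities])
# -> ['Alice Smith', 'Bob Jones']  (correct: two separate people)
```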
🧠 Token Classification: The Art of Many-to-Many
An RNN Architecture Refresher
![RNN architectures](https://cdn.learnku.com/uploads/images/202510/26/46135/D32mp4zon5.png!large)
Figure: the classic RNN architectures; NER corresponds to the many-to-many pattern on the far right
```python
import torch
import torch.nn as nn

class NERModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_tags):
        super().__init__()
        # word embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # bidirectional LSTM (sees both left and right context)
        self.lstm = nn.LSTM(
            embedding_dim, hidden_dim,
            bidirectional=True,
            batch_first=True
        )
        # classifier head: every token gets its own tag prediction
        self.classifier = nn.Linear(hidden_dim * 2, num_tags)
        # dropout against overfitting
        self.dropout = nn.Dropout(0.1)

    def forward(self, sentences):
        # sentences: [batch_size, seq_len]
        embedded = self.embedding(sentences)    # [batch_size, seq_len, embedding_dim]
        lstm_out, _ = self.lstm(embedded)       # [batch_size, seq_len, hidden_dim*2]
        lstm_out = self.dropout(lstm_out)
        # classify every position
        tag_scores = self.classifier(lstm_out)  # [batch_size, seq_len, num_tags]
        return tag_scores

# Model configuration
vocab_size = 10000    # vocabulary size
embedding_dim = 100   # embedding dimension
hidden_dim = 128      # LSTM hidden size
num_tags = 5          # BIO tags: O, B-DIS, I-DIS, B-CHEM, I-CHEM

model = NERModel(vocab_size, embedding_dim, hidden_dim, num_tags)
print(f"Number of parameters: {sum(p.numel() for p in model.parameters()):,}")
```
A Walk Through Training
```python
def train_ner_model(model, train_data, epochs=10):
    """Train the NER model."""
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    # ignore padding positions (see the collate sketch after this block
    # for how those -100 labels are produced)
    criterion = nn.CrossEntropyLoss(ignore_index=-100)

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        correct_predictions = 0
        total_predictions = 0

        for batch_sentences, batch_tags in train_data:
            optimizer.zero_grad()
            # forward pass
            tag_scores = model(batch_sentences)
            # flatten to [batch*seq_len, num_tags] vs. [batch*seq_len] for the loss
            loss = criterion(
                tag_scores.view(-1, tag_scores.size(-1)),
                batch_tags.view(-1)
            )
            # backward pass
            loss.backward()
            # gradient clipping (guards against exploding gradients)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            # accuracy bookkeeping
            predictions = torch.argmax(tag_scores, dim=-1)
            mask = (batch_tags != -100)  # valid (non-padding) positions
            correct_predictions += (predictions == batch_tags)[mask].sum().item()
            total_predictions += mask.sum().item()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_data)
        accuracy = correct_predictions / total_predictions
        print(f"Epoch {epoch+1}: Loss={avg_loss:.4f}, Accuracy={accuracy:.4f}")

# Simulated run
print("🚀 Training the NER model...")
# train_ner_model(model, train_loader, epochs=5)
```
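The loop above assumes `train_data` yields equal-length batches in which padded tag positions are already set to -100. Here is a minimal sketch of a collate function that produces such batches (the padding ID 0 and the helper name are my assumptions, not part of the original code):

```python
import torch
from torch.nn.utils.rnn import pad_sequence

def collate_batch(samples, pad_token_id=0):
    """samples: list of (token_id_tensor, tag_id_tensor) pairs of varying length."""
    sentences = [s for s, _ in samples]
    tags = [t for _, t in samples]
    # pad token IDs with the vocabulary's padding ID...
    batch_sentences = pad_sequence(sentences, batch_first=True, padding_value=pad_token_id)
    # ...and pad tags with -100 so CrossEntropyLoss(ignore_index=-100) skips them
    batch_tags = pad_sequence(tags, batch_first=True, padding_value=-100)
    return batch_sentences, batch_tags

# Example: two sentences of different lengths
samples = [
    (torch.tensor([5, 8, 2]), torch.tensor([0, 1, 2])),
    (torch.tensor([7, 3]), torch.tensor([0, 3])),
]
print(collate_batch(samples)[1])  # tensor([[0, 1, 2], [0, 3, -100]])
```

Pass this as `collate_fn` to a `DataLoader` and the training loop works on variable-length corpora unchanged.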
🏥 Hands-On Task: Training a Medical NER Expert
The BC5CDR Dataset: A Treasure Trove of Medical Entities
```python
class BC5CDRDataProcessor:
    def __init__(self):
        self.entity_types = ["Disease", "Chemical"]
        # BC5CDR uses the PubTator format: a title line, an abstract line,
        # then one tab-separated annotation line per entity
        self.sample_data = (
            "6794356|t|Tricuspid valve regurgitation and lithium carbonate toxicity in a newborn infant.\n"
            "6794356|a|A newborn with massive tricuspid regurgitation, atrial flutter, "
            "congestive heart failure, and a high serum lithium level is described.\n"
            "6794356\t0\t29\tTricuspid valve regurgitation\tDisease\tD014262\n"
            "6794356\t34\t51\tlithium carbonate\tChemical\tD016651\n"
            "6794356\t52\t60\ttoxicity\tDisease\tD064420"
        )

    def parse_bc5cdr_format(self, data_text):
        """Parse one document in BC5CDR (PubTator) format."""
        lines = data_text.strip().split('\n')
        # title and abstract
        title_line = lines[0].split('|')
        abstract_line = lines[1].split('|')
        doc_id = title_line[0]
        title = title_line[2]
        abstract = abstract_line[2]
        # full text = title + space + abstract
        full_text = title + " " + abstract
        # entity annotations
        entities = []
        for line in lines[2:]:
            if line.strip():
                parts = line.split('\t')
                entities.append({
                    'start': int(parts[1]),
                    'end': int(parts[2]),
                    'text': parts[3],
                    'type': parts[4],
                    'ontology_id': parts[5]
                })
        return {'doc_id': doc_id, 'text': full_text, 'entities': entities}

    def convert_to_bio_format(self, text, entities):
        """Convert character-offset annotations to character-level BIO labels."""
        char_labels = ['O'] * len(text)
        # process entities in order of their start position
        for entity in sorted(entities, key=lambda x: x['start']):
            start, end = entity['start'], entity['end']
            entity_type = entity['type']
            if start < len(text) and end <= len(text):
                char_labels[start] = f"B-{entity_type}"  # B- on the first character
                for i in range(start + 1, end):           # I- on the rest
                    char_labels[i] = f"I-{entity_type}"
        return char_labels

    def tokenize_and_align(self, text, char_labels):
        """Whitespace-tokenize and align labels (real projects need subword alignment)."""
        tokens = text.split()  # simplified tokenization
        token_labels = []
        current_pos = 0
        for token in tokens:
            # locate the token in the original text
            start_pos = text.find(token, current_pos)
            if start_pos != -1:
                # use the label of the token's first character
                token_labels.append(char_labels[start_pos])
                current_pos = start_pos + len(token)
            else:
                token_labels.append('O')
        return tokens, token_labels

# Demo of the data pipeline
processor = BC5CDRDataProcessor()
parsed_data = processor.parse_bc5cdr_format(processor.sample_data)

print("📄 Parsed BC5CDR document:")
print(f"Doc ID: {parsed_data['doc_id']}")
print(f"Text: {parsed_data['text'][:100]}...")
print(f"Entity count: {len(parsed_data['entities'])}")
for entity in parsed_data['entities']:
    print(f"🔍 {entity['text']} → {entity['type']} ({entity['ontology_id']})")

# Convert to BIO
char_labels = processor.convert_to_bio_format(parsed_data['text'], parsed_data['entities'])
tokens, token_labels = processor.tokenize_and_align(parsed_data['text'], char_labels)

print("\n🏷️ BIO labels (first 10 tokens):")
for token, label in zip(tokens[:10], token_labels[:10]):
    print(f"{token:>15} → {label}")
```
PubMedBERT: The BERT Specialist for Biomedicine
```python
from transformers import AutoTokenizer, BertForTokenClassification
import torch

class MedicalNERWithBERT:
    def __init__(self):
        # Microsoft's biomedical BERT, pretrained on PubMed abstracts
        self.model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"
        # load the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        # label scheme
        self.label2id = {
            "O": 0,
            "B-Disease": 1, "I-Disease": 2,
            "B-Chemical": 3, "I-Chemical": 4
        }
        self.id2label = {v: k for k, v in self.label2id.items()}
        # load the model with a fresh token-classification head
        self.model = BertForTokenClassification.from_pretrained(
            self.model_name,
            num_labels=len(self.label2id),
            id2label=self.id2label,
            label2id=self.label2id
        )

    def prepare_training_data(self, texts, entities_list):
        """Tokenize the texts and align the NER labels."""
        all_input_ids = []
        all_attention_masks = []
        all_labels = []
        for text, entities in zip(texts, entities_list):
            # tokenize
            encoding = self.tokenizer(
                text,
                padding='max_length',
                truncation=True,
                max_length=512,
                return_tensors='pt'
            )
            # align labels to the wordpiece tokens
            word_ids = encoding.word_ids()
            labels = self.align_labels(text, entities, word_ids)
            all_input_ids.append(encoding['input_ids'])
            all_attention_masks.append(encoding['attention_mask'])
            all_labels.append(torch.tensor(labels))
        return {
            'input_ids': torch.cat(all_input_ids),
            'attention_mask': torch.cat(all_attention_masks),
            'labels': torch.stack(all_labels)
        }

    def align_labels(self, text, entities, word_ids):
        """Align character-level NER labels with BERT's subword tokens."""
        # character-level labels first
        char_labels = ['O'] * len(text)
        for entity in entities:
            start, end = entity['start'], entity['end']
            entity_type = entity['type']
            if start < len(text) and end <= len(text):
                char_labels[start] = f"B-{entity_type}"
                for i in range(start + 1, end):
                    char_labels[i] = f"I-{entity_type}"
        # map to BERT tokens
        aligned_labels = []
        for word_id in word_ids:
            if word_id is None:
                aligned_labels.append(-100)  # special tokens ([CLS], [SEP], padding)
            else:
                # simplification kept from the original post: the word index is
                # used as a character index; the offset-mapping sketch after this
                # block shows the proper way to do this
                if word_id < len(char_labels):
                    label = char_labels[word_id]
                    aligned_labels.append(self.label2id.get(label, 0))
                else:
                    aligned_labels.append(0)
        return aligned_labels

    def train(self, train_data, epochs=3):
        """Fine-tune the medical NER model."""
        optimizer = torch.optim.Adam(self.model.parameters(), lr=2e-5)
        self.model.train()
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(device)
        for epoch in range(epochs):
            total_loss = 0
            # simplified: a real setup would iterate over a DataLoader
            input_ids = train_data['input_ids'].to(device)
            attention_mask = train_data['attention_mask'].to(device)
            labels = train_data['labels'].to(device)
            optimizer.zero_grad()
            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            print(f"Epoch {epoch + 1}: Loss = {total_loss:.4f}")

    def predict(self, text):
        """Predict the medical entities in a text."""
        self.model.eval()
        # tokenize
        inputs = self.tokenizer(
            text,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=512
        )
        # predict
        with torch.no_grad():
            outputs = self.model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=2)
        # decode IDs back to tokens and label names
        tokens = self.tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
        labels = [self.id2label[pred.item()] for pred in predictions[0]]
        # extract entities
        return self.extract_entities_from_predictions(tokens, labels)

    def extract_entities_from_predictions(self, tokens, labels):
        """Merge BIO-labeled wordpieces back into entity strings."""
        entities = []
        current_entity = None
        for token, label in zip(tokens, labels):
            if token in ['[CLS]', '[SEP]', '[PAD]']:
                continue
            if label.startswith('B-'):
                if current_entity:
                    entities.append(current_entity)
                current_entity = {
                    'text': token.replace('##', ''),
                    'type': label[2:]
                }
            elif label.startswith('I-') and current_entity:
                # "##" marks a subword continuation; whole words get a space
                if token.startswith('##'):
                    current_entity['text'] += token[2:]
                else:
                    current_entity['text'] += ' ' + token
            else:
                if current_entity:
                    entities.append(current_entity)
                    current_entity = None
        if current_entity:
            entities.append(current_entity)
        return entities

# Usage example
medical_ner = MedicalNERWithBERT()

# A sample medical sentence (note: the classification head is randomly
# initialized, so predictions only become meaningful after fine-tuning)
medical_text = "Patient diagnosed with diabetes mellitus and prescribed metformin."
predicted_entities = medical_ner.predict(medical_text)

print("🩺 Medical NER predictions:")
for entity in predicted_entities:
    entity_type = "disease" if entity['type'] == "Disease" else "chemical"
    print(f"  - {entity['text']} → {entity_type}")
```
🔬 The Full Experiment Pipeline
Data Preparation and Model Training
```python
class MedicalNERExperiment:
    def __init__(self):
        self.medical_ner = MedicalNERWithBERT()
        self.data_processor = BC5CDRDataProcessor()

    def load_bc5cdr_dataset(self, file_path):
        """Load the BC5CDR dataset (simplified for the demo: returns two
        hard-coded documents; a real run would parse the actual BC5CDR files)."""
        sample_documents = [
            {
                'text': "Tricuspid valve regurgitation and lithium carbonate toxicity in a newborn infant.",
                'entities': [
                    {'start': 0, 'end': 29, 'type': 'Disease', 'text': 'Tricuspid valve regurgitation'},
                    {'start': 34, 'end': 51, 'type': 'Chemical', 'text': 'lithium carbonate'},
                    {'start': 52, 'end': 60, 'type': 'Disease', 'text': 'toxicity'}
                ]
            },
            {
                'text': "Patient with diabetes mellitus treated with insulin injections.",
                'entities': [
                    {'start': 13, 'end': 30, 'type': 'Disease', 'text': 'diabetes mellitus'},
                    {'start': 44, 'end': 51, 'type': 'Chemical', 'text': 'insulin'}
                ]
            }
        ]
        return sample_documents

    def run_experiment(self):
        """Run the full experiment."""
        print("🧪 Starting the medical NER experiment...")

        # 1. load data
        documents = self.load_bc5cdr_dataset("bc5cdr_data.txt")
        print(f"📚 Loaded {len(documents)} medical documents")

        # 2. prepare training data
        texts = [doc['text'] for doc in documents]
        entities_list = [doc['entities'] for doc in documents]
        train_data = self.medical_ner.prepare_training_data(texts, entities_list)
        print("✅ Training data ready")

        # 3. train the model
        print("🚀 Fine-tuning PubMedBERT...")
        self.medical_ner.train(train_data, epochs=2)

        # 4. test the model
        test_text = "Patient diagnosed with hypertension and prescribed lisinopril daily."
        entities = self.medical_ner.predict(test_text)
        print(f"\n🎯 Test text: {test_text}")
        print("Recognized medical entities:")
        for entity in entities:
            print(f"  - {entity['text']} ({entity['type']})")

        # 5. evaluate
        self.evaluate_model(documents)

    def evaluate_model(self, test_documents):
        """Evaluate the model (simplified: exact match on lowercased entity text)."""
        correct_entities = 0
        total_predicted = 0
        total_actual = 0
        for doc in test_documents:
            predicted_entities = self.medical_ner.predict(doc['text'])
            actual_entities = doc['entities']
            predicted_texts = {e['text'].lower() for e in predicted_entities}
            actual_texts = {e['text'].lower() for e in actual_entities}
            correct_entities += len(predicted_texts & actual_texts)
            total_predicted += len(predicted_texts)
            total_actual += len(actual_texts)

        precision = correct_entities / total_predicted if total_predicted > 0 else 0
        recall = correct_entities / total_actual if total_actual > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        print("\n📊 Evaluation results:")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall:    {recall:.4f}")
        print(f"  F1 Score:  {f1:.4f}")

# Run the experiment
experiment = MedicalNERExperiment()
experiment.run_experiment()
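Text-set matching is only a rough proxy. The community standard for NER is entity-level evaluation with seqeval (it appears in the dependency list below), which scores complete BIO spans rather than loose strings. A minimal sketch with hand-written tag sequences (the toy tags exist only to show the API):

```python
from seqeval.metrics import classification_report, f1_score

# gold vs. predicted BIO sequences, one inner list per sentence (toy data)
y_true = [["B-Disease", "I-Disease", "O", "B-Chemical", "O"]]
y_pred = [["B-Disease", "I-Disease", "O", "O",          "O"]]

# seqeval counts a hit only when the full span AND the type match
print(f1_score(y_true, y_pred))  # ~0.667: 1 of 2 gold entities found, 0 false positives
print(classification_report(y_true, y_pred))
```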
🚀 Deployment and Applications
A Medical NER API Service
```python
from flask import Flask, request, jsonify

# assumes the MedicalNERWithBERT class defined above is importable here
app = Flask(__name__)

# load the model once, at startup
medical_ner_model = MedicalNERWithBERT()

@app.route('/api/medical-ner', methods=['POST'])
def extract_medical_entities():
    """Medical entity extraction API."""
    try:
        data = request.json
        text = data.get('text', '')
        if not text:
            return jsonify({'error': 'text must not be empty'}), 400

        # extract medical entities
        entities = medical_ner_model.predict(text)

        # summary statistics
        disease_count = len([e for e in entities if e['type'] == 'Disease'])
        chemical_count = len([e for e in entities if e['type'] == 'Chemical'])

        return jsonify({
            'success': True,
            'text': text,
            'entities': entities,
            'statistics': {
                'total_entities': len(entities),
                'diseases': disease_count,
                'chemicals': chemical_count
            }
        })
    except Exception as e:
        return jsonify({'error': str(e)}), 500

@app.route('/health', methods=['GET'])
def health_check():
    return jsonify({'status': 'healthy', 'model': 'PubMedBERT-NER'})

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
```
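To exercise the endpoint from a separate process, a small client sketch using `requests` (assuming the service runs locally on port 5000 as configured above; the printed statistics are illustrative):

```python
import requests

# call the local medical NER service
resp = requests.post(
    "http://localhost:5000/api/medical-ner",
    json={"text": "Patient diagnosed with diabetes mellitus and prescribed metformin."},
    timeout=30,
)
result = resp.json()
print(result["statistics"])  # e.g. {'total_entities': 2, 'diseases': 1, 'chemicals': 1}
for ent in result.get("entities", []):
    print(ent["text"], "→", ent["type"])
```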
Deployment Environment Configuration

```python
# Python environment setup
import pymysql
pymysql.install_as_MySQLdb()  # MySQL driver shim (only relevant if you persist results to MySQL)

# NER-specific dependencies (requirements.txt)
"""
torch>=1.9.0
transformers>=4.0.0
flask>=2.0.0
seqeval>=1.2.0  # standard NER evaluation library
"""
```
A Golang Batch-Processing Script
```go
// Batch NER over a directory of medical papers
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"strings"
	"time"
)

type MedicalEntity struct {
	Text string `json:"text"`
	Type string `json:"type"`
}

type NERResult struct {
	Text     string          `json:"text"`
	Entities []MedicalEntity `json:"entities"`
}

// callMedicalNERAPI posts one document to the Flask service and decodes the
// entities (the original script referenced this helper without defining it;
// this is a minimal implementation)
func callMedicalNERAPI(text, apiURL string) ([]MedicalEntity, error) {
	payload, err := json.Marshal(map[string]string{"text": text})
	if err != nil {
		return nil, err
	}
	resp, err := http.Post(apiURL, "application/json", bytes.NewReader(payload))
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	var result struct {
		Entities []MedicalEntity `json:"entities"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return nil, err
	}
	return result.Entities, nil
}

// saveResults writes all results to a JSON file (also a minimal
// implementation of a helper the original script left undefined)
func saveResults(results []NERResult, outputFile string) error {
	data, err := json.MarshalIndent(results, "", "  ")
	if err != nil {
		return err
	}
	return ioutil.WriteFile(outputFile, data, 0644)
}

func processMedicalPapers(inputDir, outputFile, apiURL string) error {
	files, err := ioutil.ReadDir(inputDir)
	if err != nil {
		return err
	}
	var results []NERResult
	for _, file := range files {
		if !strings.HasSuffix(file.Name(), ".txt") {
			continue
		}
		// read one medical paper
		content, err := ioutil.ReadFile(inputDir + "/" + file.Name())
		if err != nil {
			log.Printf("failed to read file: %v", err)
			continue
		}
		// call the NER API
		entities, err := callMedicalNERAPI(string(content), apiURL)
		if err != nil {
			log.Printf("NER analysis failed: %v", err)
			continue
		}
		results = append(results, NERResult{
			Text:     string(content),
			Entities: entities,
		})
		// throttle to avoid overwhelming the API
		time.Sleep(100 * time.Millisecond)
	}
	// persist the results
	return saveResults(results, outputFile)
}

func main() {
	inputDir := "./medical_papers"
	outputFile := "./medical_ner_results.json"
	apiURL := "http://localhost:5000/api/medical-ner"

	fmt.Println("🏥 Batch-processing medical papers...")
	if err := processMedicalPapers(inputDir, outputFile, apiURL); err != nil {
		log.Fatal("processing failed: ", err)
	}
	fmt.Println("✅ Medical NER batch run complete!")
}
```
🎉 Today's Takeaways
- NER fundamentals: application scenarios from chatbots to medical text analysis
- BIO tagging: giving every token a precise "ID badge"
- Token classification models: the many-to-many RNN architecture in action
- Hands-on medical NER: the full BC5CDR + PubMedBERT pipeline
- Engineering deployment: a complete path from the lab to production
🔮 Where the Technology Goes Next
From Basics to the Frontier
```python
# How NER techniques have evolved
ner_evolution = {
    "traditional": "rule-based systems and dictionary matching",
    "machine learning": "feature engineering with CRF, SVM, etc.",
    "deep learning": "LSTM / BiLSTM + CRF",
    "pretraining era": "Transformers such as BERT and RoBERTa",
    "current frontier": "ChatGPT-style large language models doing NER"
}

# Directions to watch
future_directions = [
    "few-shot learning",
    "cross-lingual NER",
    "multimodal NER (text + images)",
    "real-time streaming NER",
    "personalized domain adaptation"
]
```
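As a taste of the "current frontier" entry above: LLM-based NER often comes down to careful prompting plus JSON parsing. A minimal, model-agnostic sketch that only builds such a prompt (the template wording is illustrative, and no specific LLM API is assumed):

```python
def build_ner_prompt(text, entity_types=("Disease", "Chemical")):
    """Construct a prompt asking an LLM to return entities as JSON."""
    types = ", ".join(entity_types)
    return (
        f"Extract all named entities of types [{types}] from the text below.\n"
        'Respond with a JSON list like [{"text": "...", "type": "..."}].\n\n'
        f"Text: {text}"
    )

print(build_ner_prompt("Patient developed stomach pain after taking aspirin."))
```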
Today we went from a chatbot's basic needs all the way to a specialized medical NER system. This is more than a technical exercise: it is AI put to work on real-world problems!
Next time we will explore Question Answering, so the AI can not only recognize entities but also answer complex questions about them!
If you found this useful, like and share! You now have the full toolkit for building a professional NER system!
#AILearning #NER #NamedEntityRecognition #PubMedBERT #MedicalAI #DeepLearning #NLP