Layered indexing

Pan Qiancheng 2025-04-22 20:10:42 +08:00
parent f3965e9237
commit b4c6d5cb33
1 changed file with 47 additions and 20 deletions


@@ -1,8 +1,13 @@
import re
import time
import logging
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
# Configure logging
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
# Initialize the embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')
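# Note: all-MiniLM-L6-v2 produces 384-dimensional sentence embeddings; the index
# dimension below is taken from embeddings.shape[1], so it follows the model automatically.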
@@ -57,60 +62,79 @@ def split_traceback_layers(traceback_string):
lines = traceback_string.strip().split("\n")
return [line.strip() for line in lines if line.strip()]
# Build the layered vector database
# Timing: database construction
start_db_time = time.time()
# Build the layered vector database, now with a FAISS index per entry
layered_error_db = []
for entry in error_db:
cleaned = clean_traceback(entry["error"])
layers = split_traceback_layers(cleaned)
embeddings = model.encode(layers)
index = faiss.IndexFlatIP(embeddings.shape[1])  # build an inner-product (IP) index
index.add(embeddings.astype('float32'))  # add the layer vectors to the index
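# Note: IndexFlatIP scores are raw inner products; they equal cosine similarity only
# if the vectors are L2-normalized first (e.g. faiss.normalize_L2(embeddings) or
# model.encode(..., normalize_embeddings=True)). Without normalization, longer
# vectors can dominate the per-layer maximum.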
layered_error_db.append({
"layers": layers,
"vectors": embeddings,
"vectors": embeddings, # 保留原始向量(可选)
"index": index, # 添加 FAISS 索引
"solution": entry["solution"]
})
end_db_time = time.time()
logging.info(f"创建数据库耗时:{end_db_time - start_db_time:.2f}")
# Query error message
# Exactly the same
user_error = """Traceback (most recent call last):\n File \"image_error.py\", line 55, in <module>\n loss = criterion(outputs, labels)\n File \"module.py\", line 1553, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n File \"module.py\", line 1562, in _call_impl\n return forward_call(*args, **kwargs)\n File \"loss.py\", line 1188, in forward\n return F.cross_entropy(input, target, weight=self.weight,\n File \"functional.py\", line 3104, in cross_entropy\n return cross_entropy_loss(input, target, ...)\nIndexError: Target 15 is out of bounds.\n"""
# user_error = """Traceback (most recent call last):\n File \"image_error.py\", line 55, in <module>\n loss = criterion(outputs, labels)\n File \"module.py\", line 1553, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n File \"module.py\", line 1562, in _call_impl\n return forward_call(*args, **kwargs)\n File \"loss.py\", line 1188, in forward\n return F.cross_entropy(input, target, weight=self.weight,\n File \"functional.py\", line 3104, in cross_entropy\n return cross_entropy_loss(input, target, ...)\nIndexError: Target 15 is out of bounds.\n"""
# Completely different
# user_error = """"Traceback (most recent call last):\n File \"D:\\Develop\\Projects\\bash-hook\\matches\\perf_2.py\", line 94, in <module>\n D, I = index.search(query_vec, k=1)\n File \"C:\\Users\\qcqcqc\\.conda\\envs\\sentence_transformers\\lib\\site-packages\\faiss\\class_wrappers.py\", line 329, in replacement_search\n assert d == self.d\nAssertionError\n"""
# Bottom layers differ
# user_error = """Traceback (most recent call last):\n File \"image_error.py\", line 55, in <module>\n loss = criterion(outputs, labels)\n File \"module.py\", line 1553, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n File \"module.py\", line 1570, in _call_impl\n return forward_call(*args, **kwargs)\n File \"loss.py\", line 1200, in forward\n return F.mse_loss(input, target, reduction=self.reduction)\n File \"functional.py\", line 2301, in mse_loss\n return mean_squared_error_loss(input, target, ...)\nValueError: The size of input tensor must match the size of target tensor.\n"""
user_error = """Traceback (most recent call last):\n File \"image_error.py\", line 55, in <module>\n loss = criterion(outputs, labels)\n File \"module.py\", line 1553, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n File \"module.py\", line 1570, in _call_impl\n return forward_call(*args, **kwargs)\n File \"loss.py\", line 1200, in forward\n return F.mse_loss(input, target, reduction=self.reduction)\n File \"functional.py\", line 2301, in mse_loss\n return mean_squared_error_loss(input, target, ...)\nValueError: The size of input tensor must match the size of target tensor.\n"""
# Top layers differ
# user_error = """Traceback (most recent call last):\n File \"train_model.py\", line 72, in <module>\n loss = loss_function(predictions, ground_truth)\n File \"core.py\", line 442, in _call_function\n return self._execute(*args, **kwargs)\n File \"core.py\", line 450, in _execute\n return forward_execution(*args, **kwargs)\n File \"loss_functions.py\", line 205, in forward\n return F.cross_entropy(input, target, weight=self.weight,\n File \"functional.py\", line 3104, in cross_entropy\n return cross_entropy_loss(input, target, ...)\nIndexError: Target 15 is out of bounds.\n"""
# No error message
# user_error = "success."
# Timing: query
start_query_time = time.time()
cleaned_user = clean_traceback(user_error)
user_layers = split_traceback_layers(cleaned_user)
user_vectors = model.encode(user_layers)
# Compute the layer-wise match score (average of per-layer max similarities)
def compute_layered_similarity_avg(user_vecs, db_vecs):
score = 0.0
for u_vec in user_vecs:
sims = np.dot(db_vecs, u_vec)
score += np.max(sims)
return score / len(user_vecs) if len(user_vecs) > 0 else 0.0
def compute_layered_similarity_sco(user_vecs, db_vecs, layer_weights=None):
def compute_layered_similarity_avg(user_vecs, db_index, layer_weights=None):
score = 0.0
weighted_score = 0.0
# Default weights (deeper layers weigh more)
if layer_weights is None:
layer_weights = np.linspace(0.1, 1, len(user_vecs))  # shallow layers get small weights, deep layers large ones
layer_weights = np.ones(len(user_vecs))  # every layer gets the same weight (default 1)
for i, u_vec in enumerate(user_vecs):
sims = np.dot(db_vecs, u_vec)  # similarities between this layer and every DB layer vector
max_sim = np.max(sims)  # maximum similarity for this layer
u_vec = u_vec.astype('float32').reshape(1, -1)
sim, _ = db_index.search(u_vec, k=1)  # similarity between this layer and its closest vector in the index
max_sim = sim[0][0]
weighted_score += max_sim * layer_weights[i]  # weight the similarity by this layer's weight
weighted_score += max_sim * layer_weights[i]
return weighted_score / np.sum(layer_weights) if len(user_vecs) > 0 else 0.0
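# The two scorers differ only in their default weights: _avg weighs every layer
# equally, while _sco below biases the score towards deeper layers via np.linspace.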
def compute_layered_similarity_sco(user_vecs, db_index, layer_weights=None):
score = 0.0
weighted_score = 0.0
if layer_weights is None:
layer_weights = np.linspace(0.1, 1, len(user_vecs))  # default: weights increase from shallow to deep layers
for i, u_vec in enumerate(user_vecs):
u_vec = u_vec.astype('float32').reshape(1, -1)
sim, _ = db_index.search(u_vec, k=1)  # similarity between this layer and its closest vector in the index
max_sim = sim[0][0]
weighted_score += max_sim * layer_weights[i]
# Weighted average score
return weighted_score / np.sum(layer_weights) if len(user_vecs) > 0 else 0.0
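# Example: with 4 traceback layers, np.linspace(0.1, 1, 4) yields weights
# [0.1, 0.4, 0.7, 1.0], so the deepest layer (the exception line) contributes
# ten times as much as the topmost frame to the weighted average.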
@@ -118,11 +142,14 @@ def compute_layered_similarity_sco(user_vecs, db_vecs, layer_weights=None):
best_match = None
best_score = -1
for db_entry in layered_error_db:
score = compute_layered_similarity_sco(user_vectors, db_entry["vectors"])
score = compute_layered_similarity_sco(user_vectors, db_entry["index"])
if score > best_score:
best_score = score
best_match = db_entry
end_query_time = time.time()
logging.info(f"查询耗时:{end_query_time - start_query_time:.2f}")
# Print the matching result
print("--- Cleaned user error message ---")
print(cleaned_user)