Layered indexing
This commit is contained in:
parent f3965e9237
commit b4c6d5cb33
@@ -1,8 +1,13 @@
 import re
+import time
+import logging
 from sentence_transformers import SentenceTransformer
+import faiss
 import numpy as np
 
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
+
 # Initialize the model
 model = SentenceTransformer('all-MiniLM-L6-v2')
 
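The hunks below reference `error_db` and `clean_traceback`, which are defined in the unchanged part of the file and therefore do not appear in this diff. For orientation, here is a minimal sketch of the shapes the changed code assumes; the field names follow the usage in the diff, while the example entry and the cleaning rules are purely illustrative:

# Illustrative sketch only -- the real definitions live outside this diff.
error_db = [
    {
        "error": "Traceback (most recent call last): ... IndexError: Target 15 is out of bounds.",
        "solution": "Make sure the model's number of output classes covers every label value.",
    },
    # ... more known errors ...
]

def clean_traceback(traceback_string):
    # Example cleaning: mask file paths and line numbers so the embeddings
    # focus on the call structure and the exception message.
    cleaned = re.sub(r'File ".*?"', 'File "<path>"', traceback_string)
    cleaned = re.sub(r'line \d+', 'line <n>', cleaned)
    return cleaned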
@@ -57,60 +62,79 @@ def split_traceback_layers(traceback_string):
     lines = traceback_string.strip().split("\n")
     return [line.strip() for line in lines if line.strip()]
 
-# Build the layered vector database
+# Timing: build the database
+start_db_time = time.time()
+# Build the layered vector database, now with a FAISS index per entry
 layered_error_db = []
 
 for entry in error_db:
     cleaned = clean_traceback(entry["error"])
     layers = split_traceback_layers(cleaned)
     embeddings = model.encode(layers)
+
+    index = faiss.IndexFlatIP(embeddings.shape[1])  # build an inner-product index
+    index.add(embeddings.astype('float32'))  # add the layer vectors to the index
+
     layered_error_db.append({
         "layers": layers,
-        "vectors": embeddings,
+        "vectors": embeddings,  # keep the raw vectors (optional)
+        "index": index,  # the FAISS index for this entry
         "solution": entry["solution"]
     })
 
+end_db_time = time.time()
+logging.info(f"Database build time: {end_db_time - start_db_time:.2f} s")
 # Query an error message
 
 # Identical to a database entry
-user_error = """Traceback (most recent call last):\n File \"image_error.py\", line 55, in <module>\n loss = criterion(outputs, labels)\n File \"module.py\", line 1553, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n File \"module.py\", line 1562, in _call_impl\n return forward_call(*args, **kwargs)\n File \"loss.py\", line 1188, in forward\n return F.cross_entropy(input, target, weight=self.weight,\n File \"functional.py\", line 3104, in cross_entropy\n return cross_entropy_loss(input, target, ...)\nIndexError: Target 15 is out of bounds.\n"""
+# user_error = """Traceback (most recent call last):\n File \"image_error.py\", line 55, in <module>\n loss = criterion(outputs, labels)\n File \"module.py\", line 1553, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n File \"module.py\", line 1562, in _call_impl\n return forward_call(*args, **kwargs)\n File \"loss.py\", line 1188, in forward\n return F.cross_entropy(input, target, weight=self.weight,\n File \"functional.py\", line 3104, in cross_entropy\n return cross_entropy_loss(input, target, ...)\nIndexError: Target 15 is out of bounds.\n"""
 # Completely different
 # user_error = """Traceback (most recent call last):\n File \"D:\\Develop\\Projects\\bash-hook\\matches\\perf_2.py\", line 94, in <module>\n D, I = index.search(query_vec, k=1)\n File \"C:\\Users\\qcqcqc\\.conda\\envs\\sentence_transformers\\lib\\site-packages\\faiss\\class_wrappers.py\", line 329, in replacement_search\n assert d == self.d\nAssertionError\n"""
 # Bottom (deepest) layers differ
-# user_error = """Traceback (most recent call last):\n File \"image_error.py\", line 55, in <module>\n loss = criterion(outputs, labels)\n File \"module.py\", line 1553, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n File \"module.py\", line 1570, in _call_impl\n return forward_call(*args, **kwargs)\n File \"loss.py\", line 1200, in forward\n return F.mse_loss(input, target, reduction=self.reduction)\n File \"functional.py\", line 2301, in mse_loss\n return mean_squared_error_loss(input, target, ...)\nValueError: The size of input tensor must match the size of target tensor.\n"""
+user_error = """Traceback (most recent call last):\n File \"image_error.py\", line 55, in <module>\n loss = criterion(outputs, labels)\n File \"module.py\", line 1553, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n File \"module.py\", line 1570, in _call_impl\n return forward_call(*args, **kwargs)\n File \"loss.py\", line 1200, in forward\n return F.mse_loss(input, target, reduction=self.reduction)\n File \"functional.py\", line 2301, in mse_loss\n return mean_squared_error_loss(input, target, ...)\nValueError: The size of input tensor must match the size of target tensor.\n"""
 # Top layers differ
 # user_error = """Traceback (most recent call last):\n File \"train_model.py\", line 72, in <module>\n loss = loss_function(predictions, ground_truth)\n File \"core.py\", line 442, in _call_function\n return self._execute(*args, **kwargs)\n File \"core.py\", line 450, in _execute\n return forward_execution(*args, **kwargs)\n File \"loss_functions.py\", line 205, in forward\n return F.cross_entropy(input, target, weight=self.weight,\n File \"functional.py\", line 3104, in cross_entropy\n return cross_entropy_loss(input, target, ...)\nIndexError: Target 15 is out of bounds.\n"""
 # No error message
 # user_error = "success."
 
+# Timing: query
+start_query_time = time.time()
 cleaned_user = clean_traceback(user_error)
 user_layers = split_traceback_layers(cleaned_user)
 user_vectors = model.encode(user_layers)
 
 # Compute the layer-wise match score (average of per-layer max similarity)
-def compute_layered_similarity_avg(user_vecs, db_vecs):
-    score = 0.0
-    for u_vec in user_vecs:
-        sims = np.dot(db_vecs, u_vec)
-        score += np.max(sims)
-    return score / len(user_vecs) if len(user_vecs) > 0 else 0.0
-
-
-def compute_layered_similarity_sco(user_vecs, db_vecs, layer_weights=None):
+def compute_layered_similarity_avg(user_vecs, db_index, layer_weights=None):
     score = 0.0
     weighted_score = 0.0
 
     # Default weights (deeper layers weigh more)
     if layer_weights is None:
-        layer_weights = np.linspace(0.1, 1, len(user_vecs))  # shallow layers get smaller weights, deep layers larger ones
+        layer_weights = np.ones(len(user_vecs))  # equal weight for every layer, default 1
 
     for i, u_vec in enumerate(user_vecs):
-        sims = np.dot(db_vecs, u_vec)  # similarity of this layer against every DB layer
-        max_sim = np.max(sims)  # best similarity for this layer
+        u_vec = u_vec.astype('float32').reshape(1, -1)
+        sim, _ = db_index.search(u_vec, k=1)  # similarity between this layer and its nearest vector in the index
+        max_sim = sim[0][0]
 
-        weighted_score += max_sim * layer_weights[i]  # weight the similarity by this layer's weight
+        weighted_score += max_sim * layer_weights[i]
 
     return weighted_score / np.sum(layer_weights) if len(user_vecs) > 0 else 0.0
 
 
+def compute_layered_similarity_sco(user_vecs, db_index, layer_weights=None):
+    score = 0.0
+    weighted_score = 0.0
+
+    if layer_weights is None:
+        layer_weights = np.linspace(0.1, 1, len(user_vecs))  # default: weights increase from shallow to deep layers
+
+    for i, u_vec in enumerate(user_vecs):
+        u_vec = u_vec.astype('float32').reshape(1, -1)
+        sim, _ = db_index.search(u_vec, k=1)  # similarity between this layer and its nearest vector in the index
+        max_sim = sim[0][0]
+
+        weighted_score += max_sim * layer_weights[i]
+
+    # Weighted average score
+    return weighted_score / np.sum(layer_weights) if len(user_vecs) > 0 else 0.0
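One caveat about the scores above: `faiss.IndexFlatIP` returns raw inner products, and the `all-MiniLM-L6-v2` embeddings are not guaranteed to be unit length, so the per-layer similarities are unbounded rather than true cosine similarities. If cosine similarity is the intent, one option (not part of this commit, just a sketch) is to normalize the vectors on both the index side and the query side:

# Sketch only: with unit-length vectors, inner product equals cosine similarity.
embeddings = model.encode(layers, normalize_embeddings=True).astype('float32')
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

# The query side must be normalized the same way.
user_vectors = model.encode(user_layers, normalize_embeddings=True)

With normalized vectors every per-layer `max_sim` falls in [-1, 1], which keeps the weighted scores comparable across database entries.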
@@ -118,11 +142,14 @@ def compute_layered_similarity_sco(user_vecs, db_vecs, layer_weights=None):
 best_match = None
 best_score = -1
 for db_entry in layered_error_db:
-    score = compute_layered_similarity_sco(user_vectors, db_entry["vectors"])
+    score = compute_layered_similarity_sco(user_vectors, db_entry["index"])
     if score > best_score:
         best_score = score
         best_match = db_entry
 
+end_query_time = time.time()
+logging.info(f"Query time: {end_query_time - start_query_time:.2f} s")
+
 # Print the match result
 print("--- Cleaned user error message ---")
 print(cleaned_user)
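With the default `np.linspace(0.1, 1, len(user_vecs))` weights in `compute_layered_similarity_sco`, deeper frames dominate the score: for a five-layer traceback the weights are [0.1, 0.325, 0.55, 0.775, 1.0], so the exception line counts ten times as much as the outermost frame, and differences near the exception message matter most. Whatever output code follows this hunk is not shown in the diff; a minimal, purely illustrative way to consume the result would be:

# Illustrative only -- the actual output code is outside this diff.
if best_match is not None:
    print(f"Best score: {best_score:.4f}")
    print("--- Suggested solution ---")
    print(best_match["solution"])
else:
    print("No matching error found.")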
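A final design note: because each database entry carries its own `IndexFlatIP`, every query runs one small FAISS search per entry per layer. An alternative layout (again only a sketch, not what this commit does) is a single shared index over all layers with a mapping back to the owning entry, which scales better as the error database grows:

# Sketch: one global index instead of one index per entry (hypothetical).
all_vectors = []
owners = []  # owners[i] = position of the owning entry in layered_error_db
for entry_id, entry in enumerate(layered_error_db):
    for vec in entry["vectors"]:
        all_vectors.append(vec)
        owners.append(entry_id)

all_vectors = np.asarray(all_vectors, dtype='float32')
global_index = faiss.IndexFlatIP(all_vectors.shape[1])
global_index.add(all_vectors)

# One search covers all user layers at once; sims[i][0] is the best score for
# user layer i, and owners[ids[i][0]] is the entry that produced it.
sims, ids = global_index.search(user_vectors.astype('float32'), k=1)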