import re

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model (all-MiniLM-L6-v2 produces 384-dim vectors).
model = SentenceTransformer('all-MiniLM-L6-v2')


def clean_traceback(traceback_string: str) -> str:
    """Normalize a raw Python traceback so that equivalent errors produce similar text."""
    cleaned = traceback_string.replace('\r', '')
    # Drop the "Traceback (most recent call last):" header lines.
    cleaned = re.sub(r'^Traceback \(most recent call last\):[\n]+', '', cleaned, flags=re.MULTILINE)
    # Frames of the form: File "/abs/path/module.py", line 42, in func
    file_line_regex = re.compile(r'^ *File \"(.*?)\", line (\d+)(, in .*?)?$', re.MULTILINE)

    def replace_file_line(match):
        full_path = match.group(1)
        line_num = match.group(2)
        function_part = match.group(3) or ''
        # Keep only the basename so machine-specific paths do not affect matching.
        filename = full_path.split('/')[-1].split('\\')[-1]
        return f'  File "{filename}", line {line_num}{function_part}'

    cleaned = file_line_regex.sub(replace_file_line, cleaned)
    # Collapse object reprs that embed memory addresses, e.g. "<Foo object at 0x7f...>".
    cleaned = re.sub(r'<.* at 0x[0-9a-fA-F]+>', '<...>', cleaned)
    # Remove blank lines and surrounding whitespace.
    cleaned = re.sub(r'\n\s*\n+', '\n', cleaned).strip()
    return cleaned


def split_traceback_layers(traceback_string: str):
    """Split a (cleaned) traceback into its non-empty lines, one "layer" per line."""
    return [line.strip() for line in traceback_string.strip().split('\n') if line.strip()]
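
# Example (hypothetical input) of what the two helpers above produce. For the raw
# traceback
#     Traceback (most recent call last):
#       File "/srv/app/handlers.py", line 42, in handle
#         value = payload["user_id"]
#     KeyError: 'user_id'
# clean_traceback() drops the header and keeps only the basename "handlers.py",
# and split_traceback_layers() then yields the layers:
#     ['File "handlers.py", line 42, in handle',
#      'value = payload["user_id"]',
#      "KeyError: 'user_id'"]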


def rebuild_index(error_memory):
    """Rebuild a flat inner-product FAISS index from every stored error vector."""
    vectors = []
    id_to_index = {}
    index_to_id = {}

    for idx, (db_id, item) in enumerate(error_memory.items()):
        vectors.append(item["vector"])
        id_to_index[db_id] = idx
        index_to_id[idx] = db_id

    if not vectors:
        return None, id_to_index, index_to_id

    mat = np.array(vectors).astype("float32")
    index = faiss.IndexFlatIP(mat.shape[1])
    index.add(mat)

    return index, id_to_index, index_to_id


# Layered similarity: compare a query traceback to a stored one layer by layer.
def compute_layered_similarity_sco(user_vecs, db_vectors):
    """
    user_vecs: np.ndarray of shape (L1, D) - one embedding per layer of the query traceback
    db_vectors: np.ndarray of shape (L2, D) - one embedding per layer of a stored traceback
    """
    weighted_score = 0.0
    layer_weights = np.logspace(0, 1, len(user_vecs))  # per-layer weights, e.g. [1, 2.15, 4.64, 10] for four layers
    layer_weights /= np.sum(layer_weights)  # normalize so the weights sum to 1

    for i, u_vec in enumerate(user_vecs):
        sims = np.dot(db_vectors, u_vec)  # dot similarity between this query layer and every DB layer
        max_sim = float(np.max(sims))     # keep only the best-matching DB layer
        weighted_score += max_sim * layer_weights[i]

    return weighted_score
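
# For intuition: with a four-layer query, the normalized weights come out to
# roughly [0.056, 0.121, 0.261, 0.562], i.e. later layers (usually the innermost
# frame and the exception message) dominate the score:
#
#     w = np.logspace(0, 1, 4)
#     w /= w.sum()   # -> array([0.0562..., 0.1211..., 0.2608..., 0.5619...])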


# Global state
error_memory = {}   # {db_id: {"error": str, "vector": np.array, "index": FAISS index, ...}}
id_to_index = {}    # db_id -> row position in the FAISS index
index_to_id = {}    # row position in the FAISS index -> db_id
current_index = 0
aggregate_index = faiss.IndexFlatIP(384)  # 384 = output dimension of all-MiniLM-L6-v2
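

# ---------------------------------------------------------------------------
# Illustrative usage sketch, not part of the module above: the sample
# tracebacks, the db_id value, and the extra "layers" key stored in
# error_memory are assumptions made only for this demo.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    stored_raw = (
        'Traceback (most recent call last):\n'
        '  File "/srv/app/handlers.py", line 42, in handle\n'
        '    value = payload["user_id"]\n'
        "KeyError: 'user_id'\n"
    )
    incoming_raw = (
        'Traceback (most recent call last):\n'
        '  File "C:\\srv\\app\\handlers.py", line 42, in handle\n'
        '    value = payload["user_id"]\n'
        "KeyError: 'user_id'\n"
    )

    # Embed each layer of the stored error; normalized embeddings make the
    # inner products behave like cosine similarities.
    stored_layers = split_traceback_layers(clean_traceback(stored_raw))
    stored_vecs = model.encode(stored_layers, normalize_embeddings=True)  # shape (L, 384)

    db_id = 1
    error_memory[db_id] = {
        "error": stored_raw,
        "vector": stored_vecs.mean(axis=0),  # one aggregate vector for the flat index
        "layers": stored_vecs,               # per-layer vectors (extra key, demo only)
    }
    index, id_to_index, index_to_id = rebuild_index(error_memory)

    # Score an incoming error against the stored one, layer by layer.
    query_layers = split_traceback_layers(clean_traceback(incoming_raw))
    query_vecs = model.encode(query_layers, normalize_embeddings=True)
    score = compute_layered_similarity_sco(query_vecs, error_memory[db_id]["layers"])
    print(f"layered similarity: {score:.3f}")  # close to 1.0 for near-identical errors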