# bash_problem_search/app/dependencies.py
#
# 71 lines
# 2.4 KiB
# Python

import faiss
import numpy as np
import re
from sentence_transformers import SentenceTransformer
# Load the sentence-embedding (vector) model at import time.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Patterns used by clean_traceback(), compiled once instead of on every call.
_TRACEBACK_HEADER_RE = re.compile(
    r'^Traceback \(most recent call last\):[\n]+', re.MULTILINE
)
_FILE_LINE_RE = re.compile(
    r'^ *File \"(.*?)\", line (\d+)(, in .*?)?$', re.MULTILINE
)
# The quantifier is lazy on purpose: a greedy `.*` would swallow everything
# between the first `<` and the last ` at 0x...>` on a line, erasing the text
# that sits between two separate object reprs.
_OBJECT_REPR_RE = re.compile(r'<.*? at 0x[0-9a-fA-F]+>')
_BLANK_LINES_RE = re.compile(r'\n\s*\n+')


def _shorten_file_line(match):
    """Rebuild a traceback ``File`` line keeping only the file's base name."""
    full_path, line_num = match.group(1), match.group(2)
    function_part = match.group(3) or ''
    # Handle both POSIX ('/') and Windows ('\\') path separators.
    filename = full_path.split('/')[-1].split('\\')[-1]
    return f' File "{filename}", line {line_num}{function_part}'


def clean_traceback(traceback_string: str) -> str:
    """Normalize a Python traceback so machine-specific details are removed.

    Steps, in order:
      1. Drop carriage returns.
      2. Remove every ``Traceback (most recent call last):`` header line.
      3. Reduce the path in each ``File "...", line N`` entry to its base name.
      4. Replace each ``<... at 0x...>`` object repr with ``<...>``; every
         repr on a line is replaced individually.
      5. Collapse blank lines and strip leading/trailing whitespace.

    Args:
        traceback_string: Raw traceback text.

    Returns:
        The normalized traceback text.
    """
    cleaned = traceback_string.replace('\r', '')
    cleaned = _TRACEBACK_HEADER_RE.sub('', cleaned)
    cleaned = _FILE_LINE_RE.sub(_shorten_file_line, cleaned)
    cleaned = _OBJECT_REPR_RE.sub('<...>', cleaned)
    return _BLANK_LINES_RE.sub('\n', cleaned).strip()
def split_traceback_layers(traceback_string: str):
    """Split traceback text into its non-empty lines, each stripped of
    surrounding whitespace.

    Returns:
        A list of stripped lines in their original order; lines that are
        empty or whitespace-only are omitted.
    """
    layers = []
    for raw_line in traceback_string.strip().split('\n'):
        stripped = raw_line.strip()
        if stripped:
            layers.append(stripped)
    return layers
def rebuild_index(error_memory):
    """Build a fresh FAISS inner-product index from the stored vectors.

    Args:
        error_memory: Mapping of db_id -> item, where ``item["vector"]`` is
            that entry's embedding.

    Returns:
        A tuple ``(index, id_to_index, index_to_id)``. Row positions follow
        the iteration order of ``error_memory.items()``. ``index`` is ``None``
        when ``error_memory`` is empty (both maps are then empty as well).
    """
    entries = list(error_memory.items())
    stacked = [item["vector"] for _, item in entries]
    position_of = {db_id: pos for pos, (db_id, _) in enumerate(entries)}
    db_id_at = {pos: db_id for pos, (db_id, _) in enumerate(entries)}

    # Nothing to index: hand back the (empty) maps without building an index.
    if not stacked:
        return None, position_of, db_id_at

    matrix = np.array(stacked).astype("float32")
    flat_index = faiss.IndexFlatIP(matrix.shape[1])
    flat_index.add(matrix)
    return flat_index, position_of, db_id_at
# Layered similarity computation
def compute_layered_similarity_sco(user_vecs, db_vectors):
    """Score how well the user's traceback layers match a stored entry's layers.

    For every user layer, the best (maximum) dot-product similarity against
    all DB layers is taken. These per-layer maxima are combined with
    log-spaced weights (``np.logspace(0, 1, L1)``, i.e. growing from 1 to 10)
    normalized to sum to 1, so layers later in ``user_vecs`` count more.

    Args:
        user_vecs: np.ndarray of shape (L1, D).
        db_vectors: np.ndarray of shape (L2, D).

    Returns:
        The weighted similarity as a float. Returns 0.0 when either input is
        empty, since there is nothing to match.
    """
    layer_count = len(user_vecs)
    # Explicit size checks: array truthiness is ambiguous, and np.max raises
    # ValueError on a zero-size array, so an empty DB side must be caught here.
    if layer_count == 0 or np.size(db_vectors) == 0:
        return 0.0

    # Layer weights, e.g. [1, 2.15, ..., 10], normalized to sum to 1.
    layer_weights = np.logspace(0, 1, layer_count)
    layer_weights /= np.sum(layer_weights)

    weighted_score = 0.0
    for weight, u_vec in zip(layer_weights, user_vecs):
        # Dot similarity of this user layer against every DB layer;
        # keep only the best-matching DB layer.
        best_match = float(np.max(np.dot(db_vectors, u_vec)))
        weighted_score += best_match * weight
    return float(weighted_score)
# Global state (module-level, created at import time)
error_memory = {} # {db_id: {"error": str, "vector": np.array, "index": FAISS index, ...}}
# NOTE(review): presumably the same db_id <-> row-position maps that
# rebuild_index() returns — confirm against callers.
id_to_index = {}
index_to_id = {}
# NOTE(review): not used anywhere in the visible code; presumably a running
# position/counter — confirm against callers.
current_index = 0
aggregate_index = faiss.IndexFlatIP(384) # output dimension of all-MiniLM-L6-v2