# bash_problem_search/app/dependencies.py

import faiss
import numpy as np
import re
from sentence_transformers import SentenceTransformer
# Load the sentence-embedding (vector) model once, at module import time.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Patterns used by clean_traceback, compiled once at import time.
_TRACEBACK_HEADER_RE = re.compile(
    r'^Traceback \(most recent call last\):[\n]+', re.MULTILINE
)
_FILE_LINE_RE = re.compile(
    r'^ *File \"(.*?)\", line (\d+)(, in .*?)?$', re.MULTILINE
)
# Non-greedy on purpose: a greedy ``.*`` would span from the first ``<`` to
# the last address on the line and swallow the text between two reprs.
_OBJECT_REPR_RE = re.compile(r'<.*? at 0x[0-9a-fA-F]+>')
_BLANK_LINES_RE = re.compile(r'\n\s*\n+')


def _shorten_file_line(match):
    """Rewrite one ``File "...", line N`` frame line to use the bare filename."""
    full_path, line_num = match.group(1), match.group(2)
    function_part = match.group(3) or ''
    # Handle both POSIX and Windows separators.
    filename = full_path.split('/')[-1].split('\\')[-1]
    return f' File "{filename}", line {line_num}{function_part}'


def clean_traceback(traceback_string: str) -> str:
    """Normalize a Python traceback so machine-specific details are removed.

    The following steps are applied in order:

    1. Carriage returns are dropped.
    2. ``Traceback (most recent call last):`` header lines are removed.
    3. Each ``File "<path>", line N[, in func]`` line keeps only the file's
       base name and is re-indented with a single leading space.
    4. Every object repr carrying a memory address (``<... at 0x...>``) is
       replaced by ``<...>``; several reprs on one line are masked
       individually.
    5. Runs of blank lines are collapsed and the result is stripped.

    Args:
        traceback_string: Raw traceback text.

    Returns:
        The cleaned traceback text.
    """
    cleaned = traceback_string.replace('\r', '')
    cleaned = _TRACEBACK_HEADER_RE.sub('', cleaned)
    cleaned = _FILE_LINE_RE.sub(_shorten_file_line, cleaned)
    cleaned = _OBJECT_REPR_RE.sub('<...>', cleaned)
    return _BLANK_LINES_RE.sub('\n', cleaned).strip()

def split_traceback_layers(traceback_string: str):
    """Split traceback text into its non-empty lines, each stripped of whitespace.

    Lines that are empty or contain only whitespace are dropped.
    """
    layers = []
    for raw_line in traceback_string.strip().split('\n'):
        layer = raw_line.strip()
        if layer:
            layers.append(layer)
    return layers


def rebuild_index(error_memory):
    """Build a fresh FAISS inner-product index from the stored error vectors.

    Args:
        error_memory: Mapping of ``db_id`` to a record holding a ``"vector"``
            entry. Iteration order determines each vector's row position.

    Returns:
        A tuple ``(index, id_to_index, index_to_id)``. ``index`` is a
        ``faiss.IndexFlatIP`` holding every vector, or ``None`` when
        ``error_memory`` is empty. The two dicts map ``db_id`` to row
        position and back.
    """
    entries = list(error_memory.items())
    vectors = [record["vector"] for _, record in entries]
    index_to_id = {pos: db_id for pos, (db_id, _) in enumerate(entries)}
    id_to_index = {db_id: pos for pos, db_id in index_to_id.items()}

    # Nothing to index: hand back the (empty) mappings without an index.
    if not vectors:
        return None, id_to_index, index_to_id

    stacked = np.array(vectors)
    mat = stacked.astype("float32")
    dimension = mat.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(mat)

    return index, id_to_index, index_to_id


# Layered similarity computation
def compute_layered_similarity_sco(user_vecs, db_vectors):
    """Score how well the user's traceback layers match a stored traceback.

    For every user layer the best (maximum) dot-product similarity against
    all stored layers is taken. These per-layer maxima are then combined
    with log-spaced weights, ``np.logspace(0, 1, L1)`` normalized to sum to
    1, so later layers weigh up to 10x more than the first one.

    Args:
        user_vecs: np.ndarray of shape (L1, D), one row per user layer.
        db_vectors: np.ndarray of shape (L2, D), one row per stored layer.

    Returns:
        The weighted similarity as a float. Returns 0.0 when either input
        has no layers, since there is nothing to compare.
    """
    user_mat = np.asarray(user_vecs)
    db_mat = np.asarray(db_vectors)

    # With no rows on either side np.max would raise on an empty array;
    # treat "nothing to compare" as zero similarity instead.
    if len(user_mat) == 0 or len(db_mat) == 0:
        return 0.0

    # Layer weights, e.g. [1, 2.15, ..., 10], normalized to sum to 1.
    layer_weights = np.logspace(0, 1, len(user_mat))
    layer_weights /= layer_weights.sum()

    # (L1, L2) similarity matrix -> best-matching stored layer per user layer.
    best_per_layer = (user_mat @ db_mat.T).max(axis=1)
    return float(best_per_layer @ layer_weights)

# Global state (module-level, created at import time)
error_memory = {}  # {db_id: {"error": str, "vector": np.array, "index": FAISS index, ...}}
# Presumably the db_id <-> row-position mappings in the form rebuild_index
# returns them; verify against callers.
id_to_index = {}
index_to_id = {}
current_index = 0
aggregate_index = faiss.IndexFlatIP(384)  # output dimension of all-MiniLM-L6-v2