# bash_problem_search/app/router.py

from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel
from typing import List
import numpy as np
from app.dependencies import (
    model, error_memory, clean_traceback, split_traceback_layers,
    compute_layered_similarity_sco, rebuild_index,
    id_to_index, index_to_id
)
import faiss
from app.utils.response_wrapper import standard_response

# Router object that every endpoint function in this module registers on.
router = APIRouter()
# Module-level search index. Starts as None and is replaced by the first
# value returned from rebuild_index(error_memory) in the endpoints below.
aggregate_index = None

class ErrorInsert(BaseModel):
    """Request body for one error record, used by the insert and update endpoints."""

    # Raw error text; the endpoints pass it through clean_traceback before encoding.
    error: str
    # Key under which the record is stored in error_memory.
    db_id: str

class ErrorQuery(BaseModel):
    """Request body for the search endpoint."""

    # Raw error text to look up similar stored errors for.
    error: str
    # Requested number of results; the search endpoint caps it at the number
    # of stored entries.
    top_n: int

@router.post("/insert", summary="插入数据并重新建立索引")
@standard_response
def insert_errors(errors: List[ErrorInsert]):
    """Encode a batch of errors, store them in ``error_memory`` and rebuild the index.

    Each entry is cleaned with ``clean_traceback``, split with
    ``split_traceback_layers`` and encoded with ``model.encode``. The stored
    aggregate vector is the mean of the layer vectors along axis 0. An entry
    whose ``db_id`` already exists overwrites the previous record.

    The batch is staged first and written to ``error_memory`` only after every
    entry has been processed, so a failure on one entry leaves the store
    unchanged instead of partially updated without an index rebuild.

    Args:
        errors: Records to insert; each carries the raw error text and its db_id.

    Returns:
        A dict with key ``"inserted"`` listing the db_id of every submitted
        entry, in request order.

    Raises:
        HTTPException: 400 when an entry's text yields no traceback layers.
    """
    global aggregate_index, id_to_index, index_to_id

    staged = {}
    for entry in errors:
        cleaned = clean_traceback(entry.error)
        layers = split_traceback_layers(cleaned)
        if len(layers) == 0:
            # The mean over zero layer vectors is NaN; reject the entry rather
            # than store a NaN aggregate vector.
            raise HTTPException(
                status_code=400,
                detail=f"db_id {entry.db_id}: error text produced no traceback layers",
            )
        layer_vectors = model.encode(layers)
        agg_vector = np.mean(layer_vectors, axis=0)

        staged[entry.db_id] = {
            "error": entry.error,
            "vector": agg_vector,
            "layers": layers,
            "layer_vectors": layer_vectors
        }

    # Commit only after the whole batch was encoded successfully.
    for db_id, record in staged.items():
        error_memory[db_id] = record

    aggregate_index, id_to_index, index_to_id = rebuild_index(error_memory)
    return {"inserted": [e.db_id for e in errors]}

@router.get("/list", summary="获取已建立索引的所有 db_id")
@standard_response
def list_db_ids():
    """Return every db_id currently present in ``error_memory`` as a list."""
    stored_ids = list(error_memory)
    return stored_ids

@router.delete("/delete", summary="删除指定 db_id 的数据")
@standard_response
def delete_errors(ids: List[str] = Query(...)):
    """Remove the given db_ids from ``error_memory`` and rebuild the index.

    Ids that are not stored are ignored. The index is rebuilt whether or not
    anything was removed.

    Args:
        ids: db_ids to delete, supplied as a required query parameter.

    Returns:
        A dict with key ``"deleted"`` listing the ids that were actually
        removed, in the order they were processed.
    """
    global aggregate_index, id_to_index, index_to_id

    removed = []
    for target in ids:
        try:
            del error_memory[target]
        except KeyError:
            # Unknown (or already removed) id: nothing to report for it.
            continue
        removed.append(target)

    rebuilt = rebuild_index(error_memory)
    aggregate_index, id_to_index, index_to_id = rebuilt
    return {"deleted": removed}

@router.put("/update", summary="更新指定 db_id 的数据")
@standard_response
def update_error(error: ErrorInsert):
    """Re-encode an existing record and rebuild the index.

    The new text is cleaned with ``clean_traceback``, split with
    ``split_traceback_layers`` and encoded with ``model.encode``. The stored
    aggregate vector is the mean of the layer vectors along axis 0.

    Args:
        error: Replacement record; ``error.db_id`` must already be stored.

    Returns:
        A dict with key ``"updated"`` holding the db_id that was replaced.

    Raises:
        HTTPException: 404 when the db_id is not in ``error_memory``; 400 when
            the new text yields no traceback layers.
    """
    global aggregate_index, id_to_index, index_to_id

    if error.db_id not in error_memory:
        raise HTTPException(status_code=404, detail="db_id not found")

    cleaned = clean_traceback(error.error)
    layers = split_traceback_layers(cleaned)
    if len(layers) == 0:
        # The mean over zero layer vectors is NaN; keep the existing record
        # rather than overwrite it with a NaN aggregate vector.
        raise HTTPException(
            status_code=400,
            detail="error text produced no traceback layers",
        )
    layer_vectors = model.encode(layers)
    agg_vector = np.mean(layer_vectors, axis=0)

    error_memory[error.db_id] = {
        "error": error.error,
        "vector": agg_vector,
        "layers": layers,
        "layer_vectors": layer_vectors
    }

    aggregate_index, id_to_index, index_to_id = rebuild_index(error_memory)
    return {"updated": error.db_id}

@router.get("/page", summary="分页获取错误信息")
@standard_response
def get_error_page(page: int = Query(1, ge=1), page_size: int = Query(10, gt=0)):
    """Return one page of stored errors, in ``error_memory`` iteration order.

    Args:
        page: 1-based page number (query parameter, default 1, minimum 1).
        page_size: Entries per page (query parameter, default 10, must be > 0).

    Returns:
        A dict with the total entry count, the requested page and page size,
        and ``"data"``: a list of ``{"db_id", "error"}`` dicts for the page.

    Raises:
        HTTPException: 404 when the store is empty or the page starts past
            the last entry.
    """
    total = len(error_memory)
    if total == 0:
        raise HTTPException(status_code=404, detail="数据库为空")

    offset = (page - 1) * page_size
    if offset >= total:
        raise HTTPException(status_code=404, detail="页码超出范围")

    window = list(error_memory.items())[offset:offset + page_size]
    rows = []
    for db_id, entry in window:
        rows.append({"db_id": db_id, "error": entry["error"]})

    return {
        "total": total,
        "page": page,
        "page_size": page_size,
        "data": rows,
    }

@router.post("/search", summary="搜索 top_n 个最相似的错误")
@standard_response
def search_error(query: ErrorQuery):
    """Return the stored errors most similar to ``query.error``.

    The query text is cleaned, split into layers and encoded the same way as
    stored records. The mean of the query's layer vectors is searched against
    ``aggregate_index`` for up to ``top_n`` candidates. Each candidate also
    gets a layer score from ``compute_layered_similarity_sco`` and, per layer
    pair, the whitespace-separated tokens the two layers share.

    Args:
        query: Error text to look up and the number of results wanted.

    Returns:
        A list of result dicts (``db_id``, ``aggregate_similarity``,
        ``layer_similarity``, ``matched_layers``, ``matched_keywords``),
        sorted by layer similarity and then aggregate similarity, descending.

    Raises:
        HTTPException: 404 when ``error_memory`` is empty; 400 when ``top_n``
            is not positive or the query text yields no traceback layers.
    """
    global aggregate_index, id_to_index, index_to_id

    if not error_memory:
        raise HTTPException(status_code=404, detail="数据库为空")
    if query.top_n <= 0:
        # A non-positive k is not a meaningful search request; fail with a
        # client error instead of passing it on to the index.
        raise HTTPException(status_code=400, detail="top_n must be a positive integer")

    cleaned = clean_traceback(query.error)
    user_layers = split_traceback_layers(cleaned)
    if len(user_layers) == 0:
        # The mean over zero layer vectors is NaN and cannot be searched.
        raise HTTPException(status_code=400, detail="error text produced no traceback layers")
    user_vectors = model.encode(user_layers)

    if aggregate_index is None:
        # This module's index starts as None and is otherwise only assigned by
        # the insert/delete/update endpoints; build it on first use when
        # error_memory already holds entries.
        aggregate_index, id_to_index, index_to_id = rebuild_index(error_memory)

    user_agg_vector = np.mean(user_vectors, axis=0).astype('float32').reshape(1, -1)
    k = min(query.top_n, len(error_memory))
    sim, indices = aggregate_index.search(user_agg_vector, k)

    results = []
    for idx, score in zip(indices[0], sim[0]):
        if idx < 0:
            # NOTE(review): assumes aggregate_index is a faiss index, which
            # reports missing neighbours with label -1 - confirm.
            continue
        db_id = index_to_id[int(idx)]
        db_entry = error_memory[db_id]

        # Layer-by-layer similarity score.
        layer_score = compute_layered_similarity_sco(user_vectors, db_entry["layer_vectors"])

        # Pair layers positionally (zip stops at the shorter list) and collect
        # the tokens both layers contain. This is a simple keyword heuristic;
        # it can be swapped for a more elaborate matcher.
        matched_keywords = []
        for user_layer, db_layer in zip(user_layers, db_entry["layers"]):
            common_tokens = set(user_layer.split()) & set(db_layer.split())
            # Sorted so the response is deterministic; set order is not.
            matched_keywords.append(sorted(common_tokens))

        results.append({
            "db_id": db_id,
            "aggregate_similarity": round(float(score), 4),       # score from the aggregate index
            "layer_similarity": round(float(layer_score), 4),     # layered score, as a plain float
            "matched_layers": db_entry["layers"],                 # stored traceback layers of the match
            "matched_keywords": matched_keywords                  # shared tokens per layer pair
        })

    # Order by layer score first, then by aggregate score.
    results.sort(key=lambda x: (x["layer_similarity"], x["aggregate_similarity"]), reverse=True)

    return results