commit c29909ad10056b7f2d2bcb5425bb8df1141a77d6
Author: QCQCQC <1220204124@zust.edu.cn>
Date:   Fri Apr 25 18:54:15 2025 +0800

    init

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ef81b1e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/.venv/
diff --git a/app/__pycache__/dependencies.cpython-310.pyc b/app/__pycache__/dependencies.cpython-310.pyc
new file mode 100644
index 0000000..7a692a2
Binary files /dev/null and b/app/__pycache__/dependencies.cpython-310.pyc differ
diff --git a/app/__pycache__/main.cpython-310.pyc b/app/__pycache__/main.cpython-310.pyc
new file mode 100644
index 0000000..423f7fd
Binary files /dev/null and b/app/__pycache__/main.cpython-310.pyc differ
diff --git a/app/__pycache__/router.cpython-310.pyc b/app/__pycache__/router.cpython-310.pyc
new file mode 100644
index 0000000..69b8169
Binary files /dev/null and b/app/__pycache__/router.cpython-310.pyc differ
diff --git a/app/__pycache__/utils.cpython-310.pyc b/app/__pycache__/utils.cpython-310.pyc
new file mode 100644
index 0000000..1ffe47c
Binary files /dev/null and b/app/__pycache__/utils.cpython-310.pyc differ
diff --git a/app/dependencies.py b/app/dependencies.py
new file mode 100644
index 0000000..88019f5
--- /dev/null
+++ b/app/dependencies.py
@@ -0,0 +1,71 @@
+import faiss
+import numpy as np
+import re
+from sentence_transformers import SentenceTransformer
+# Load the sentence embedding model
+model = SentenceTransformer('all-MiniLM-L6-v2')
+
+def clean_traceback(traceback_string: str) -> str:
+    cleaned = traceback_string.replace('\r', '')
+    cleaned = re.sub(r'^Traceback \(most recent call last\):[\n]+', '', cleaned, flags=re.MULTILINE)
+    file_line_regex = re.compile(r'^ *File \"(.*?)\", line (\d+)(, in .*?)?$', re.MULTILINE)
+
+    def replace_file_line(match):
+        full_path = match.group(1)
+        line_num = match.group(2)
+        function_part = match.group(3) or ''
+        filename = full_path.split('/')[-1].split('\\')[-1]
+        return f' File "{filename}", line {line_num}{function_part}'
+
+    cleaned = file_line_regex.sub(replace_file_line, cleaned)
+    cleaned = re.sub(r'<.* at 0x[0-9a-fA-F]+>', '<...>', cleaned)
+    cleaned = re.sub(r'\n\s*\n+', '\n', cleaned).strip()
+    return cleaned
+
+def split_traceback_layers(traceback_string: str):
+    return [line.strip() for line in traceback_string.strip().split('\n') if line.strip()]
+
+
+def rebuild_index(error_memory):
+    vectors = []
+    id_to_index = {}
+    index_to_id = {}
+
+    for idx, (db_id, item) in enumerate(error_memory.items()):
+        vectors.append(item["vector"])
+        id_to_index[db_id] = idx
+        index_to_id[idx] = db_id
+
+    if not vectors:
+        return None, id_to_index, index_to_id
+
+    mat = np.array(vectors).astype("float32")
+    index = faiss.IndexFlatIP(mat.shape[1])
+    index.add(mat)
+
+    return index, id_to_index, index_to_id
+
+
+# Layer-by-layer similarity scoring
+def compute_layered_similarity_sco(user_vecs, db_vectors):
+    """
+    user_vecs: np.ndarray of shape (L1, D)
+    db_vectors: np.ndarray of shape (L2, D)
+    """
+    weighted_score = 0.0
+    layer_weights = np.logspace(0, 1, len(user_vecs))  # per-layer weights, e.g. [1, 2.15, ..., 10]
+    layer_weights /= np.sum(layer_weights)  # normalize so the weights sum to 1
+
+    for i, u_vec in enumerate(user_vecs):
+        sims = np.dot(db_vectors, u_vec)  # dot-product similarity between this user layer and every DB layer
+        max_sim = float(np.max(sims))  # keep the best-matching DB layer
+        weighted_score += max_sim * layer_weights[i]
+
+    return weighted_score
+
+# Global state
+error_memory = {}  # {db_id: {"error": str, "vector": np.ndarray, "layers": list[str], "layer_vectors": np.ndarray}}
+id_to_index = {}
+index_to_id = {}
+current_index = 0
+aggregate_index = faiss.IndexFlatIP(384)  # output dimension of all-MiniLM-L6-v2
\ No newline at end of file
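
A minimal sketch (not part of the commit) of what the preprocessing helpers in app/dependencies.py produce for a typical traceback. The sample traceback text is invented for illustration, and importing app.dependencies assumes the all-MiniLM-L6-v2 model can be loaded from the environment.

# Hypothetical usage sketch -- assumes the project root is on PYTHONPATH.
from app.dependencies import clean_traceback, split_traceback_layers

raw = (
    'Traceback (most recent call last):\n'
    '  File "/srv/app/worker/tasks.py", line 42, in handle\n'
    '    payload = json.loads(body)\n'
    '  File "/usr/lib/python3.10/json/decoder.py", line 355, in raw_decode\n'
    '    raise JSONDecodeError("Expecting value", s, err.value) from None\n'
    'json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)\n'
)

cleaned = clean_traceback(raw)            # header dropped, paths shortened to bare file names
layers = split_traceback_layers(cleaned)  # one stripped, non-empty line per "layer"
print(layers)
# ['File "tasks.py", line 42, in handle', 'payload = json.loads(body)', ...]
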
diff --git a/app/main.py b/app/main.py
new file mode 100644
index 0000000..521e565
--- /dev/null
+++ b/app/main.py
@@ -0,0 +1,16 @@
+from fastapi import FastAPI
+from app.router import router
+
+app = FastAPI(
+    title="Vector Search API",
+    version="1.0.0",
+    docs_url="/docs",
+    redoc_url="/redoc",
+    openapi_url="/openapi.json"
+)
+
+app.include_router(router)
+
+@app.get("/")
+def root():
+    return {"message": "Vector Search API is running"}
diff --git a/app/router.py b/app/router.py
new file mode 100644
index 0000000..740abd2
--- /dev/null
+++ b/app/router.py
@@ -0,0 +1,105 @@
+from fastapi import APIRouter, HTTPException, Query
+from pydantic import BaseModel
+from typing import List
+import numpy as np
+from app.dependencies import (
+    model, error_memory, clean_traceback, split_traceback_layers,
+    compute_layered_similarity_sco, rebuild_index,
+    id_to_index, index_to_id
+)
+import faiss
+
+router = APIRouter()
+aggregate_index = None
+
+class ErrorInsert(BaseModel):
+    error: str
+    db_id: str
+
+class ErrorQuery(BaseModel):
+    error: str
+    top_n: int
+
+@router.post("/insert", summary="Insert data and rebuild the index")
+def insert_errors(errors: List[ErrorInsert]):
+    global aggregate_index, id_to_index, index_to_id
+
+    for entry in errors:
+        cleaned = clean_traceback(entry.error)
+        layers = split_traceback_layers(cleaned)
+        layer_vectors = model.encode(layers)
+        agg_vector = np.mean(layer_vectors, axis=0)
+
+        error_memory[entry.db_id] = {
+            "error": entry.error,
+            "vector": agg_vector,
+            "layers": layers,
+            "layer_vectors": layer_vectors
+        }
+
+    aggregate_index, id_to_index, index_to_id = rebuild_index(error_memory)
+    return {"inserted": [e.db_id for e in errors]}
+
+@router.get("/list", summary="List all indexed db_ids")
+def list_db_ids():
+    return list(error_memory.keys())
+
+@router.delete("/delete", summary="Delete the data for the given db_ids")
+def delete_errors(ids: List[str] = Query(...)):
+    global aggregate_index, id_to_index, index_to_id
+    deleted = []
+
+    for db_id in ids:
+        if db_id in error_memory:
+            del error_memory[db_id]
+            deleted.append(db_id)
+
+    aggregate_index, id_to_index, index_to_id = rebuild_index(error_memory)
+    return {"deleted": deleted}
+
+@router.put("/update", summary="Update the data for the given db_id")
def update_error(error: ErrorInsert):
+    if error.db_id not in error_memory:
+        raise HTTPException(status_code=404, detail="db_id not found")
+
+    cleaned = clean_traceback(error.error)
+    layers = split_traceback_layers(cleaned)
+    layer_vectors = model.encode(layers)
+    agg_vector = np.mean(layer_vectors, axis=0)
+
+    error_memory[error.db_id] = {
+        "error": error.error,
+        "vector": agg_vector,
+        "layers": layers,
+        "layer_vectors": layer_vectors
+    }
+
+    global aggregate_index, id_to_index, index_to_id
+    aggregate_index, id_to_index, index_to_id = rebuild_index(error_memory)
+    return {"updated": error.db_id}
+
+@router.post("/search", summary="Search for the top_n most similar errors")
+def search_error(query: ErrorQuery):
+    if not error_memory:
+        raise HTTPException(status_code=404, detail="database is empty")
+
+    cleaned = clean_traceback(query.error)
+    user_layers = split_traceback_layers(cleaned)
+    user_vectors = model.encode(user_layers)
+
+    user_agg_vector = np.mean(user_vectors, axis=0).astype('float32').reshape(1, -1)
+    k = min(query.top_n, len(error_memory))
+    sim, indices = aggregate_index.search(user_agg_vector, k)
+
+    results = []
+    for idx, score in zip(indices[0], sim[0]):
+        db_id = index_to_id[idx]
+        db_entry = error_memory[db_id]
+        layer_score = compute_layered_similarity_sco(user_vectors, db_entry["layer_vectors"])
+        results.append({
"db_id": db_id, + "similarity": round(layer_score, 4), + "matched_layers": db_entry["layers"] + }) + + return results diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..26fda00 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +fastapi +uvicorn +faiss-cpu +sentence-transformers +numpy + diff --git a/run.py b/run.py new file mode 100644 index 0000000..4753965 --- /dev/null +++ b/run.py @@ -0,0 +1,4 @@ +import uvicorn + +if __name__ == "__main__": + uvicorn.run("app.main:app", host="127.0.0.1", port=8000, reload=True)