commit c29909ad10056b7f2d2bcb5425bb8df1141a77d6
Author: QCQCQC <1220204124@zust.edu.cn>
Date:   Fri Apr 25 18:54:15 2025 +0800

    init

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ef81b1e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/.venv/
diff --git a/app/__pycache__/dependencies.cpython-310.pyc b/app/__pycache__/dependencies.cpython-310.pyc
new file mode 100644
index 0000000..7a692a2
Binary files /dev/null and b/app/__pycache__/dependencies.cpython-310.pyc differ
diff --git a/app/__pycache__/main.cpython-310.pyc b/app/__pycache__/main.cpython-310.pyc
new file mode 100644
index 0000000..423f7fd
Binary files /dev/null and b/app/__pycache__/main.cpython-310.pyc differ
diff --git a/app/__pycache__/router.cpython-310.pyc b/app/__pycache__/router.cpython-310.pyc
new file mode 100644
index 0000000..69b8169
Binary files /dev/null and b/app/__pycache__/router.cpython-310.pyc differ
diff --git a/app/__pycache__/utils.cpython-310.pyc b/app/__pycache__/utils.cpython-310.pyc
new file mode 100644
index 0000000..1ffe47c
Binary files /dev/null and b/app/__pycache__/utils.cpython-310.pyc differ
diff --git a/app/dependencies.py b/app/dependencies.py
new file mode 100644
index 0000000..88019f5
--- /dev/null
+++ b/app/dependencies.py
@@ -0,0 +1,71 @@
+import faiss
+import numpy as np
+import re
+from sentence_transformers import SentenceTransformer
+# Load the sentence embedding model
+model = SentenceTransformer('all-MiniLM-L6-v2')
+
+def clean_traceback(traceback_string: str) -> str:
+    cleaned = traceback_string.replace('\r', '')
+    cleaned = re.sub(r'^Traceback \(most recent call last\):[\n]+', '', cleaned, flags=re.MULTILINE)
+    file_line_regex = re.compile(r'^ *File \"(.*?)\", line (\d+)(, in .*?)?$', re.MULTILINE)
+
+    def replace_file_line(match):
+        full_path = match.group(1)
+        line_num = match.group(2)
+        function_part = match.group(3) or ''
+        filename = full_path.split('/')[-1].split('\\')[-1]
+        return f' File "{filename}", line {line_num}{function_part}'
+
+    cleaned = file_line_regex.sub(replace_file_line, cleaned)
+    cleaned = re.sub(r'<.* at 0x[0-9a-fA-F]+>', '<...>', cleaned)
+    cleaned = re.sub(r'\n\s*\n+', '\n', cleaned).strip()
+    return cleaned
+
+def split_traceback_layers(traceback_string: str):
+    return [line.strip() for line in traceback_string.strip().split('\n') if line.strip()]
+
+
+def rebuild_index(error_memory):
+    vectors = []
+    id_to_index = {}
+    index_to_id = {}
+
+    for idx, (db_id, item) in enumerate(error_memory.items()):
+        vectors.append(item["vector"])
+        id_to_index[db_id] = idx
+        index_to_id[idx] = db_id
+
+    if not vectors:
+        return None, id_to_index, index_to_id
+
+    mat = np.array(vectors).astype("float32")
+    index = faiss.IndexFlatIP(mat.shape[1])
+    index.add(mat)
+
+    return index, id_to_index, index_to_id
+
+
+# Layer-by-layer similarity scoring
+def compute_layered_similarity_sco(user_vecs, db_vectors):
+    """
+    user_vecs: np.ndarray of shape (L1, D)
+    db_vectors: np.ndarray of shape (L2, D)
+    """
+    weighted_score = 0.0
+    layer_weights = np.logspace(0, 1, len(user_vecs))  # per-layer weights, e.g. [1, 2.15, ..., 10]
+    layer_weights /= np.sum(layer_weights)  # normalize so the weights sum to 1
+
+    for i, u_vec in enumerate(user_vecs):
+        sims = np.dot(db_vectors, u_vec)  # dot-product similarity between this user layer and every DB layer
+        max_sim = float(np.max(sims))  # keep the best-matching DB layer
+        weighted_score += max_sim * layer_weights[i]
+
+    return weighted_score
+
+# Global state
+error_memory = {}  # {db_id: {"error": str, "vector": np.ndarray, "layers": list[str], "layer_vectors": np.ndarray}}
+id_to_index = {}
+index_to_id = {}
+current_index = 0
+aggregate_index = faiss.IndexFlatIP(384)  # output dimension of all-MiniLM-L6-v2
\ No newline at end of file
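
A minimal sketch (not part of the commit) of what the preprocessing helpers in app/dependencies.py produce for a typical traceback. The sample traceback text is invented for illustration, and importing app.dependencies assumes the all-MiniLM-L6-v2 model can be loaded from the environment.

# Hypothetical usage sketch -- assumes the project root is on PYTHONPATH.
from app.dependencies import clean_traceback, split_traceback_layers

raw = (
    'Traceback (most recent call last):\n'
    '  File "/srv/app/worker/tasks.py", line 42, in handle\n'
    '    payload = json.loads(body)\n'
    '  File "/usr/lib/python3.10/json/decoder.py", line 355, in raw_decode\n'
    '    raise JSONDecodeError("Expecting value", s, err.value) from None\n'
    'json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)\n'
)

cleaned = clean_traceback(raw)            # header dropped, paths shortened to bare file names
layers = split_traceback_layers(cleaned)  # one stripped, non-empty line per "layer"
print(layers)
# ['File "tasks.py", line 42, in handle', 'payload = json.loads(body)', ...]
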
diff --git a/app/main.py b/app/main.py
new file mode 100644
index 0000000..521e565
--- /dev/null
+++ b/app/main.py
@@ -0,0 +1,16 @@
+from fastapi import FastAPI
+from app.router import router
+
+app = FastAPI(
+    title="Vector Search API",
+    version="1.0.0",
+    docs_url="/docs",
+    redoc_url="/redoc",
+    openapi_url="/openapi.json"
+)
+
+app.include_router(router)
+
+@app.get("/")
+def root():
+    return {"message": "Vector Search API is running"}
diff --git a/app/router.py b/app/router.py
new file mode 100644
index 0000000..740abd2
--- /dev/null
+++ b/app/router.py
@@ -0,0 +1,105 @@
+from fastapi import APIRouter, HTTPException, Query
+from pydantic import BaseModel
+from typing import List
+import numpy as np
+from app.dependencies import (
+    model, error_memory, clean_traceback, split_traceback_layers,
+    compute_layered_similarity_sco, rebuild_index,
+    id_to_index, index_to_id
+)
+import faiss
+
+router = APIRouter()
+aggregate_index = None
+
+class ErrorInsert(BaseModel):
+    error: str
+    db_id: str
+
+class ErrorQuery(BaseModel):
+    error: str
+    top_n: int
+
+@router.post("/insert", summary="Insert data and rebuild the index")
+def insert_errors(errors: List[ErrorInsert]):
+    global aggregate_index, id_to_index, index_to_id
+
+    for entry in errors:
+        cleaned = clean_traceback(entry.error)
+        layers = split_traceback_layers(cleaned)
+        layer_vectors = model.encode(layers)
+        agg_vector = np.mean(layer_vectors, axis=0)
+
+        error_memory[entry.db_id] = {
+            "error": entry.error,
+            "vector": agg_vector,
+            "layers": layers,
+            "layer_vectors": layer_vectors
+        }
+
+    aggregate_index, id_to_index, index_to_id = rebuild_index(error_memory)
+    return {"inserted": [e.db_id for e in errors]}
+
+@router.get("/list", summary="List all indexed db_ids")
+def list_db_ids():
+    return list(error_memory.keys())
+
+@router.delete("/delete", summary="Delete the data for the given db_ids")
+def delete_errors(ids: List[str] = Query(...)):
+    global aggregate_index, id_to_index, index_to_id
+    deleted = []
+
+    for db_id in ids:
+        if db_id in error_memory:
+            del error_memory[db_id]
+            deleted.append(db_id)
+
+    aggregate_index, id_to_index, index_to_id = rebuild_index(error_memory)
+    return {"deleted": deleted}
+
+@router.put("/update", summary="Update the data for the given db_id")
def update_error(error: ErrorInsert):
+    if error.db_id not in error_memory:
+        raise HTTPException(status_code=404, detail="db_id not found")
+
+    cleaned = clean_traceback(error.error)
+    layers = split_traceback_layers(cleaned)
+    layer_vectors = model.encode(layers)
+    agg_vector = np.mean(layer_vectors, axis=0)
+
+    error_memory[error.db_id] = {
+        "error": error.error,
+        "vector": agg_vector,
+        "layers": layers,
+        "layer_vectors": layer_vectors
+    }
+
+    global aggregate_index, id_to_index, index_to_id
+    aggregate_index, id_to_index, index_to_id = rebuild_index(error_memory)
+    return {"updated": error.db_id}
+
+@router.post("/search", summary="Search for the top_n most similar errors")
+def search_error(query: ErrorQuery):
+    if not error_memory:
+        raise HTTPException(status_code=404, detail="database is empty")
+
+    cleaned = clean_traceback(query.error)
+    user_layers = split_traceback_layers(cleaned)
+    user_vectors = model.encode(user_layers)
+
+    user_agg_vector = np.mean(user_vectors, axis=0).astype('float32').reshape(1, -1)
+    k = min(query.top_n, len(error_memory))
+    sim, indices = aggregate_index.search(user_agg_vector, k)
+
+    results = []
+    for idx, score in zip(indices[0], sim[0]):
+        db_id = index_to_id[idx]
+        db_entry = error_memory[db_id]
+        layer_score = compute_layered_similarity_sco(user_vectors, db_entry["layer_vectors"])
+        results.append({
"db_id": db_id, + "similarity": round(layer_score, 4), + "matched_layers": db_entry["layers"] + }) + + return results diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..26fda00 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +fastapi +uvicorn +faiss-cpu +sentence-transformers +numpy + diff --git a/run.py b/run.py new file mode 100644 index 0000000..4753965 --- /dev/null +++ b/run.py @@ -0,0 +1,4 @@ +import uvicorn + +if __name__ == "__main__": + uvicorn.run("app.main:app", host="127.0.0.1", port=8000, reload=True)