init
commit c29909ad10
@@ -0,0 +1 @@
/.venv/
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,71 @@
import faiss
import numpy as np
import re
from sentence_transformers import SentenceTransformer

# Load the embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')


def clean_traceback(traceback_string: str) -> str:
    cleaned = traceback_string.replace('\r', '')
    cleaned = re.sub(r'^Traceback \(most recent call last\):[\n]+', '', cleaned, flags=re.MULTILINE)
    file_line_regex = re.compile(r'^ *File \"(.*?)\", line (\d+)(, in .*?)?$', re.MULTILINE)

    def replace_file_line(match):
        full_path = match.group(1)
        line_num = match.group(2)
        function_part = match.group(3) or ''
        # Keep only the file name, dropping machine-specific directories
        filename = full_path.split('/')[-1].split('\\')[-1]
        return f' File "{filename}", line {line_num}{function_part}'

    cleaned = file_line_regex.sub(replace_file_line, cleaned)
    # Replace memory addresses such as "<... at 0x7f...>" with a placeholder
    cleaned = re.sub(r'<.* at 0x[0-9a-fA-F]+>', '<...>', cleaned)
    # Collapse blank lines
    cleaned = re.sub(r'\n\s*\n+', '\n', cleaned).strip()
    return cleaned


def split_traceback_layers(traceback_string: str):
    return [line.strip() for line in traceback_string.strip().split('\n') if line.strip()]


def rebuild_index(error_memory):
    vectors = []
    id_to_index = {}
    index_to_id = {}

    for idx, (db_id, item) in enumerate(error_memory.items()):
        vectors.append(item["vector"])
        id_to_index[db_id] = idx
        index_to_id[idx] = db_id

    if not vectors:
        return None, id_to_index, index_to_id

    mat = np.array(vectors).astype("float32")
    index = faiss.IndexFlatIP(mat.shape[1])
    index.add(mat)

    return index, id_to_index, index_to_id


# Layered similarity computation
def compute_layered_similarity_sco(user_vecs, db_vectors):
    """
    user_vecs: np.ndarray of shape (L1, D)
    db_vectors: np.ndarray of shape (L2, D)
    """
    weighted_score = 0.0
    layer_weights = np.logspace(0, 1, len(user_vecs))  # per-layer weights, e.g. [1, 2.15, ..., 10]
    layer_weights /= np.sum(layer_weights)  # normalize

    for i, u_vec in enumerate(user_vecs):
        sims = np.dot(db_vectors, u_vec)  # dot similarity between this user layer and every DB layer
        max_sim = float(np.max(sims))  # take the best-matching layer
        weighted_score += max_sim * layer_weights[i]

    return weighted_score


# Global state
error_memory = {}  # {db_id: {"error": str, "vector": np.array, "index": FAISS index, ...}}
id_to_index = {}
index_to_id = {}
current_index = 0
aggregate_index = faiss.IndexFlatIP(384)  # output dimension of all-MiniLM-L6-v2
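A quick sketch of how these helpers compose, assuming the module above is importable as app.dependencies (the same path the router below uses); the traceback string is made up for illustration:

# Illustration only: run a made-up traceback through the helpers above.
from app.dependencies import clean_traceback, split_traceback_layers, model, compute_layered_similarity_sco

example = (
    'Traceback (most recent call last):\n'
    '  File "/srv/app/worker.py", line 42, in run\n'
    '    result = 1 / 0\n'
    'ZeroDivisionError: division by zero\n'
)
layers = split_traceback_layers(clean_traceback(example))
# -> ['File "worker.py", line 42, in run', 'result = 1 / 0', 'ZeroDivisionError: division by zero']
layer_vectors = model.encode(layers)  # shape (3, 384) for all-MiniLM-L6-v2
score = compute_layered_similarity_sco(layer_vectors, layer_vectors)
# score is a weighted average of each layer's best match, with later layers weighted more heavily.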
@@ -0,0 +1,16 @@
from fastapi import FastAPI
from app.router import router

app = FastAPI(
    title="Vector Search API",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
    openapi_url="/openapi.json"
)

app.include_router(router)


@app.get("/")
def root():
    return {"message": "Vector Search API is running"}
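A minimal smoke-test sketch for the app above; it assumes the module is saved as app/main.py so it can be imported as app.main, and recent Starlette versions additionally need httpx installed for TestClient:

# Assumption: the FastAPI app above lives at app/main.py.
from fastapi.testclient import TestClient
from app.main import app

client = TestClient(app)
response = client.get("/")
assert response.status_code == 200
assert response.json() == {"message": "Vector Search API is running"}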
@@ -0,0 +1,105 @@
from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel
from typing import List
import numpy as np
from app.dependencies import (
    model, error_memory, clean_traceback, split_traceback_layers,
    compute_layered_similarity_sco, rebuild_index,
    id_to_index, index_to_id
)
import faiss

router = APIRouter()
aggregate_index = None


class ErrorInsert(BaseModel):
    error: str
    db_id: str


class ErrorQuery(BaseModel):
    error: str
    top_n: int


@router.post("/insert", summary="Insert data and rebuild the index")
def insert_errors(errors: List[ErrorInsert]):
    # Rebind this module's copies of the index and the id mappings
    global aggregate_index, id_to_index, index_to_id

    for entry in errors:
        cleaned = clean_traceback(entry.error)
        layers = split_traceback_layers(cleaned)
        layer_vectors = model.encode(layers)
        agg_vector = np.mean(layer_vectors, axis=0)

        error_memory[entry.db_id] = {
            "error": entry.error,
            "vector": agg_vector,
            "layers": layers,
            "layer_vectors": layer_vectors
        }

    aggregate_index, id_to_index, index_to_id = rebuild_index(error_memory)
    return {"inserted": [e.db_id for e in errors]}


@router.get("/list", summary="List all indexed db_ids")
def list_db_ids():
    return list(error_memory.keys())


@router.delete("/delete", summary="Delete the data for the given db_ids")
def delete_errors(ids: List[str] = Query(...)):
    global aggregate_index, id_to_index, index_to_id
    deleted = []

    for db_id in ids:
        if db_id in error_memory:
            del error_memory[db_id]
            deleted.append(db_id)

    aggregate_index, id_to_index, index_to_id = rebuild_index(error_memory)
    return {"deleted": deleted}


@router.put("/update", summary="Update the data for the given db_id")
def update_error(error: ErrorInsert):
    global aggregate_index, id_to_index, index_to_id

    if error.db_id not in error_memory:
        raise HTTPException(status_code=404, detail="db_id not found")

    cleaned = clean_traceback(error.error)
    layers = split_traceback_layers(cleaned)
    layer_vectors = model.encode(layers)
    agg_vector = np.mean(layer_vectors, axis=0)

    error_memory[error.db_id] = {
        "error": error.error,
        "vector": agg_vector,
        "layers": layers,
        "layer_vectors": layer_vectors
    }

    aggregate_index, id_to_index, index_to_id = rebuild_index(error_memory)
    return {"updated": error.db_id}


@router.post("/search", summary="Search for the top_n most similar errors")
def search_error(query: ErrorQuery):
    if not error_memory:
        raise HTTPException(status_code=404, detail="Database is empty")

    cleaned = clean_traceback(query.error)
    user_layers = split_traceback_layers(cleaned)
    user_vectors = model.encode(user_layers)

    # Coarse retrieval on the aggregated (mean) vector via the FAISS index
    user_agg_vector = np.mean(user_vectors, axis=0).astype('float32').reshape(1, -1)
    k = min(query.top_n, len(error_memory))
    sim, indices = aggregate_index.search(user_agg_vector, k)

    # Re-rank the retrieved candidates with the layer-wise similarity score
    results = []
    for idx, score in zip(indices[0], sim[0]):
        db_id = index_to_id[idx]
        db_entry = error_memory[db_id]
        layer_score = compute_layered_similarity_sco(user_vectors, db_entry["layer_vectors"])
        results.append({
            "db_id": db_id,
            "similarity": round(layer_score, 4),
            "matched_layers": db_entry["layers"]
        })

    return results
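An illustrative call sequence against these endpoints, using the same TestClient setup as above; the db_id values and tracebacks are invented, and app.main is again an assumed module path:

# Illustration only: insert one traceback, then search for a similar one.
from fastapi.testclient import TestClient
from app.main import app  # assumed module path

client = TestClient(app)

client.post("/insert", json=[{
    "db_id": "err-1",
    "error": "Traceback (most recent call last):\n  File \"job.py\", line 3, in run\n    x = 1 / 0\nZeroDivisionError: division by zero",
}])

hits = client.post("/search", json={
    "error": "Traceback (most recent call last):\n  File \"task.py\", line 9, in main\n    y = 1 / 0\nZeroDivisionError: division by zero",
    "top_n": 1,
}).json()
# Each hit contains db_id, the layered similarity score, and the stored layers.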
@@ -0,0 +1,6 @@
fastapi
uvicorn
faiss-cpu
sentence-transformers
numpy