From c29909ad10056b7f2d2bcb5425bb8df1141a77d6 Mon Sep 17 00:00:00 2001
From: QCQCQC <1220204124@zust.edu.cn>
Date: Fri, 25 Apr 2025 18:54:15 +0800
Subject: [PATCH] init

---
 .gitignore          |   1 +
 app/dependencies.py |  71 ++++++++++++++
 app/main.py         |  16 +++
 app/router.py       | 105 ++++++++++++++++++++
 requirements.txt    |   6 ++
 run.py              |   4 +
 6 files changed, 203 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 app/dependencies.py
 create mode 100644 app/main.py
 create mode 100644 app/router.py
 create mode 100644 requirements.txt
 create mode 100644 run.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ef81b1e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/.venv/
diff --git a/app/dependencies.py b/app/dependencies.py
new file mode 100644
--- /dev/null
+++ b/app/dependencies.py
@@ -0,0 +1,71 @@
+import re
+import numpy as np
+import faiss
+from sentence_transformers import SentenceTransformer
+
+model = SentenceTransformer('all-MiniLM-L6-v2')
+
+def clean_traceback(traceback_string: str) -> str:
+    cleaned = traceback_string.replace('\r', '')
+    cleaned = re.sub(r'^Traceback \(most recent call last\):[\n]+', '', cleaned, flags=re.MULTILINE)
+    file_line_regex = re.compile(r'^ *File \"(.*?)\", line (\d+)(, in .*?)?$', re.MULTILINE)
+
+    def replace_file_line(match):
+        full_path = match.group(1)
+        line_num = match.group(2)
+        function_part = match.group(3) or ''
+        filename = full_path.split('/')[-1].split('\\')[-1]
+        return f' File "{filename}", line {line_num}{function_part}'
+
+    cleaned = file_line_regex.sub(replace_file_line, cleaned)
+    cleaned = re.sub(r'<.* at 0x[0-9a-fA-F]+>', '<...>', cleaned)
+    cleaned = re.sub(r'\n\s*\n+', '\n', cleaned).strip()
+    return cleaned
+
+def split_traceback_layers(traceback_string: str):
+    return [line.strip() for line in traceback_string.strip().split('\n') if line.strip()]
+
+
+def rebuild_index(error_memory):
+    vectors = []
+    id_to_index = {}
+    index_to_id = {}
+
+    for idx, (db_id, item) in enumerate(error_memory.items()):
+        vectors.append(item["vector"])
+        id_to_index[db_id] = idx
+        index_to_id[idx] = db_id
+
+    if not vectors:
+        return None, id_to_index, index_to_id
+
+    mat = np.array(vectors).astype("float32")
+    index = faiss.IndexFlatIP(mat.shape[1])
+    index.add(mat)
+
+    return index, id_to_index, index_to_id
+
+
+# Layered similarity computation
+def compute_layered_similarity_sco(user_vecs, db_vectors):
+    """
+    user_vecs: np.ndarray of shape (L1, D)
+    db_vectors: np.ndarray of shape (L2, D)
+    """
+    weighted_score = 0.0
+    layer_weights = np.logspace(0, 1, len(user_vecs))  # per-layer weights, e.g. [1, 2.15, ..., 10]
+    layer_weights /= np.sum(layer_weights)  # normalize so the weights sum to 1
+
+    for i, u_vec in enumerate(user_vecs):
+        sims = np.dot(db_vectors, u_vec)  # dot similarity of this user layer against every DB layer
+        max_sim = float(np.max(sims))  # keep only the best-matching DB layer
+        weighted_score += max_sim * layer_weights[i]
+
+    return weighted_score
+
+# Global state
+error_memory = {}  # {db_id: {"error": str, "vector": np.array, "layers": [...], "layer_vectors": np.array}}
+id_to_index = {}
+index_to_id = {}
+current_index = 0
+aggregate_index = faiss.IndexFlatIP(384)  # all-MiniLM-L6-v2 output dimension
\ No newline at end of file
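As a point of reference for the module above: clean_traceback strips the "Traceback (most recent call last):" banner, reduces absolute paths to bare filenames, masks "<... at 0x...>" memory addresses, and collapses blank lines; split_traceback_layers then turns the cleaned text into one "layer" per line. A minimal sketch of that pipeline, runnable from the repo root (the sample traceback is hypothetical; importing the module loads the all-MiniLM-L6-v2 model, downloading it on first use):

    from app.dependencies import clean_traceback, split_traceback_layers

    tb = (
        "Traceback (most recent call last):\n"
        '  File "/srv/app/worker.py", line 42, in run\n'
        "    handle(job)\n"
        "ZeroDivisionError: division by zero"
    )
    print(split_traceback_layers(clean_traceback(tb)))
    # ['File "worker.py", line 42, in run', 'handle(job)', 'ZeroDivisionError: division by zero']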
diff --git a/app/main.py b/app/main.py
new file mode 100644
index 0000000..521e565
--- /dev/null
+++ b/app/main.py
@@ -0,0 +1,16 @@
+from fastapi import FastAPI
+from app.router import router
+
+app = FastAPI(
+    title="Vector Search API",
+    version="1.0.0",
+    docs_url="/docs",
+    redoc_url="/redoc",
+    openapi_url="/openapi.json"
+)
+
+app.include_router(router)
+
+@app.get("/")
+def root():
+    return {"message": "Vector Search API is running"}
diff --git a/app/router.py b/app/router.py
new file mode 100644
index 0000000..740abd2
--- /dev/null
+++ b/app/router.py
@@ -0,0 +1,105 @@
+from fastapi import APIRouter, HTTPException, Query
+from pydantic import BaseModel
+from typing import List
+import numpy as np
+from app.dependencies import (
+    model, error_memory, clean_traceback, split_traceback_layers,
+    compute_layered_similarity_sco, rebuild_index,
+    id_to_index, index_to_id
+)
+import faiss
+
+router = APIRouter()
+aggregate_index = None
+
+class ErrorInsert(BaseModel):
+    error: str
+    db_id: str
+
+class ErrorQuery(BaseModel):
+    error: str
+    top_n: int
+
+@router.post("/insert", summary="Insert data and rebuild the index")
+def insert_errors(errors: List[ErrorInsert]):
+    global aggregate_index, id_to_index, index_to_id
+
+    for entry in errors:
+        cleaned = clean_traceback(entry.error)
+        layers = split_traceback_layers(cleaned)
+        layer_vectors = model.encode(layers)
+        agg_vector = np.mean(layer_vectors, axis=0)
+
+        error_memory[entry.db_id] = {
+            "error": entry.error,
+            "vector": agg_vector,
+            "layers": layers,
+            "layer_vectors": layer_vectors
+        }
+
+    aggregate_index, id_to_index, index_to_id = rebuild_index(error_memory)
+    return {"inserted": [e.db_id for e in errors]}
+
+@router.get("/list", summary="List all indexed db_ids")
+def list_db_ids():
+    return list(error_memory.keys())
+
+@router.delete("/delete", summary="Delete the data for the given db_ids")
+def delete_errors(ids: List[str] = Query(...)):
+    global aggregate_index, id_to_index, index_to_id
+    deleted = []
+
+    for db_id in ids:
+        if db_id in error_memory:
+            del error_memory[db_id]
+            deleted.append(db_id)
+
+    aggregate_index, id_to_index, index_to_id = rebuild_index(error_memory)
+    return {"deleted": deleted}
+
+@router.put("/update", summary="Update the data for the given db_id")
+def update_error(error: ErrorInsert):
+    if error.db_id not in error_memory:
+        raise HTTPException(status_code=404, detail="db_id not found")
+
+    cleaned = clean_traceback(error.error)
+    layers = split_traceback_layers(cleaned)
+    layer_vectors = model.encode(layers)
+    agg_vector = np.mean(layer_vectors, axis=0)
+
+    error_memory[error.db_id] = {
+        "error": error.error,
+        "vector": agg_vector,
+        "layers": layers,
+        "layer_vectors": layer_vectors
+    }
+
+    global aggregate_index, id_to_index, index_to_id
+    aggregate_index, id_to_index, index_to_id = rebuild_index(error_memory)
+    return {"updated": error.db_id}
+
+@router.post("/search", summary="Search for the top_n most similar errors")
+def search_error(query: ErrorQuery):
+    if not error_memory:
+        raise HTTPException(status_code=404, detail="database is empty")
+
+    cleaned = clean_traceback(query.error)
+    user_layers = split_traceback_layers(cleaned)
+    user_vectors = model.encode(user_layers)
+
+    user_agg_vector = np.mean(user_vectors, axis=0).astype('float32').reshape(1, -1)
+    k = min(query.top_n, len(error_memory))
+    sim, indices = aggregate_index.search(user_agg_vector, k)
+
+    results = []
+    for idx, score in zip(indices[0], sim[0]):
+        db_id = index_to_id[idx]
+        db_entry = error_memory[db_id]
+        layer_score = compute_layered_similarity_sco(user_vectors, db_entry["layer_vectors"])
+        results.append({
+            "db_id": db_id,
+            "similarity": round(layer_score, 4),
+            "matched_layers": db_entry["layers"]
+        })
+    results.sort(key=lambda r: r["similarity"], reverse=True)  # re-rank by layered score; FAISS ordered candidates by aggregate similarity only
+    return results
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
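For reference, a client-side sketch of the insert/search round trip against the endpoints above, assuming the service is running locally on port 8000 and that the requests package (not listed in requirements.txt) is installed; the db_id and traceback are hypothetical:

    import requests

    base = "http://127.0.0.1:8000"
    error = (
        "Traceback (most recent call last):\n"
        '  File "a.py", line 3, in <module>\n'
        '    int("x")\n'
        "ValueError: invalid literal for int() with base 10: 'x'"
    )
    # /insert takes a JSON array of {db_id, error} objects and rebuilds the index
    requests.post(f"{base}/insert", json=[{"db_id": "err-1", "error": error}]).raise_for_status()
    # /search takes {error, top_n} and returns re-ranked matches
    hits = requests.post(f"{base}/search", json={"error": error, "top_n": 3}).json()
    print(hits)  # [{'db_id': 'err-1', 'similarity': ..., 'matched_layers': [...]}]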
index 0000000..26fda00
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+fastapi
+uvicorn
+faiss-cpu
+sentence-transformers
+numpy
+
diff --git a/run.py b/run.py
new file mode 100644
index 0000000..4753965
--- /dev/null
+++ b/run.py
@@ -0,0 +1,4 @@
+import uvicorn
+
+if __name__ == "__main__":
+    uvicorn.run("app.main:app", host="127.0.0.1", port=8000, reload=True)
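Setup, for reference: from the repo root, install the dependencies with "pip install -r requirements.txt" and start the service with "python run.py"; the API is then served on http://127.0.0.1:8000 with interactive docs at /docs. Note that error_memory and the FAISS indexes live entirely in process memory, so inserted data is lost on every restart, and reload=True makes uvicorn restart the app whenever a source file changes; the reloader is suitable for development only.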