import re
import time
import logging

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Configure logging
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')

# Initialize the embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Mock error database
error_db = [
    {
        "error": """Traceback (most recent call last):\n File \"image_error.py\", line 55, in <module>\n loss = criterion(outputs, labels)\n File \"module.py\", line 1553, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n File \"module.py\", line 1562, in _call_impl\n return forward_call(*args, **kwargs)\n File \"loss.py\", line 1188, in forward\n return F.cross_entropy(input, target, weight=self.weight,\n File \"functional.py\", line 3104, in cross_entropy\n return cross_entropy_loss(input, target, ...)\nIndexError: Target 15 is out of bounds.\n""",
        "solution": "A value in labels is probably outside the range of the number of classes. Check that the output dimension of the model's last layer is correct."
    },
    {
        "error": """Traceback (most recent call last):\n File \"rnn_error.py\", line 1, in <module>\n import torch\nModuleNotFoundError: No module named 'torch'\n""",
        "solution": "Check whether the path exists, or whether the model file was saved correctly."
    },
    {
        "error": """Traceback (most recent call last):\n File \"cnn_success.py\", line 51, in <module>\n loss.backward()\n File \"_tensor.py\", line 521, in backward\n torch.autograd.backward(...)\n File \"__init__.py\", line 289, in backward\n _engine_run_backward(...)\n File \"graph.py\", line 768, in _engine_run_backward\n return run_backward(...)\nKeyboardInterrupt\n""",
        "solution": "The program was interrupted by the user (KeyboardInterrupt). This is not a typical code error; the program was probably stopped manually."
    },
    {
        "error": """Traceback (most recent call last):\n File \"cnn_success.py\", line 1, in <module>\n import torch\nModuleNotFoundError: No module named 'torch'\n""",
        "solution": "The module torch cannot be found; check whether PyTorch is installed."
    },
    {
        "error": """Traceback (most recent call last):\n File \"rnn_error.py\", line 40, in <module>\n loss = criterion(output, label.unsqueeze(0))\n File \"module.py\", line 1553, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n File \"module.py\", line 1562, in _call_impl\n return forward_call(*args, **kwargs)\n File \"loss.py\", line 1188, in forward\n return F.cross_entropy(input, target, weight=self.weight,\n File \"functional.py\", line 3104, in cross_entropy\n return cross_entropy_loss(input, target, ...)\nValueError: Expected input batch_size (2) to match target batch_size (1).\n""",
        "solution": "Check that the shapes of the model output and the labels match. Make sure they agree on the batch_size dimension."
    },
    {
        "error": """Syntax error in command""",
        "solution": "Check the code for syntax errors; a missing bracket, quote, or other syntax problem is likely."
    }
]


def clean_traceback(traceback_string):
    """Normalize a traceback: strip the header, shorten file paths, and drop noise."""
    cleaned = traceback_string.replace('\r', '')
    cleaned = re.sub(r'^Traceback \(most recent call last\):[\n]+', '', cleaned, flags=re.MULTILINE)
    file_line_regex = re.compile(r'^ *File \"(.*?)\", line (\d+)(, in .*?)?$', re.MULTILINE)

    def replace_file_line(match):
        full_path = match.group(1)
        line_num = match.group(2)
        function_part = match.group(3) or ''
        filename = full_path.split('/')[-1].split('\\')[-1]  # keep only the file name, drop the directory
        return f' File "{filename}", line {line_num}{function_part}'

    cleaned = file_line_regex.sub(replace_file_line, cleaned)
    cleaned = re.sub(r'<.* at 0x[0-9a-fA-F]+>', '<...>', cleaned)  # collapse object reprs with memory addresses
    cleaned = re.sub(r'\n\s*\n+', '\n', cleaned).strip()           # drop blank lines
    return cleaned


def split_traceback_layers(traceback_string):
    """Split a cleaned traceback into per-line frames so each layer can be embedded and matched independently."""
    lines = traceback_string.strip().split("\n")
    return [line.strip() for line in lines if line.strip()]


# Time the database build
start_db_time = time.time()

# Build the aggregate index
aggregate_vectors = []
for entry in error_db:
    cleaned = clean_traceback(entry["error"])
    layers = split_traceback_layers(cleaned)
    embeddings = model.encode(layers)
    aggregate_vectors.append(np.mean(embeddings, axis=0))  # represent the whole error by the mean of its layer vectors

aggregate_vectors = np.array(aggregate_vectors).astype('float32')
aggregate_index = faiss.IndexFlatIP(aggregate_vectors.shape[1])
aggregate_index.add(aggregate_vectors)

# Build the layered vector database, adding a FAISS index per entry
layered_error_db = []
for entry, agg_vec in zip(error_db, aggregate_vectors):
    cleaned = clean_traceback(entry["error"])
    layers = split_traceback_layers(cleaned)
    embeddings = model.encode(layers)
    index = faiss.IndexFlatIP(embeddings.shape[1])  # inner-product index over the layer vectors
    index.add(embeddings.astype('float32'))         # add the layer vectors to the index
    layered_error_db.append({
        "layers": layers,
        "vectors": embeddings,          # keep the raw layer vectors
        "index": index,                 # per-entry FAISS index
        "solution": entry["solution"],
        "agg_vector": agg_vec           # aggregate (mean) vector
    })

end_db_time = time.time()
logging.info(f"Database build time: {end_db_time - start_db_time:.2f} s")

# Query error message
# Identical to a database entry
# user_error = """Traceback (most recent call last):\n File \"image_error.py\", line 55, in <module>\n loss = criterion(outputs, labels)\n File \"module.py\", line 1553, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n File \"module.py\", line 1562, in _call_impl\n return forward_call(*args, **kwargs)\n File \"loss.py\", line 1188, in forward\n return F.cross_entropy(input, target, weight=self.weight,\n File \"functional.py\", line 3104, in cross_entropy\n return cross_entropy_loss(input, target, ...)\nIndexError: Target 15 is out of bounds.\n"""

# Completely different
# user_error = """Traceback (most recent call last):\n File \"D:\\Develop\\Projects\\bash-hook\\matches\\perf_2.py\", line 94, in <module>\n D, I = index.search(query_vec, k=1)\n File \"C:\\Users\\qcqcqc\\.conda\\envs\\sentence_transformers\\lib\\site-packages\\faiss\\class_wrappers.py\", line 329, in replacement_search\n assert d == self.d\nAssertionError\n"""

# Differs in the deepest frames
# user_error = """Traceback (most recent call last):\n File \"image_error.py\", line 55, in <module>\n loss = criterion(outputs, labels)\n File \"module.py\", line 1553, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n File \"module.py\", line 1570, in _call_impl\n return forward_call(*args, **kwargs)\n File \"loss.py\", line 1200, in forward\n return F.mse_loss(input, target, reduction=self.reduction)\n File \"functional.py\", line 2301, in mse_loss\n return mean_squared_error_loss(input, target, ...)\nValueError: The size of input tensor must match the size of target tensor.\n"""

# Differs in the top frames
user_error = """Traceback (most recent call last):\n File \"train_model.py\", line 72, in <module>\n loss = loss_function(predictions, ground_truth)\n File \"core.py\", line 442, in _call_function\n return self._execute(*args, **kwargs)\n File \"core.py\", line 450, in _execute\n return forward_execution(*args, **kwargs)\n File \"loss_functions.py\", line 205, in forward\n return F.cross_entropy(input, target, weight=self.weight,\n File \"functional.py\", line 3104, in cross_entropy\n return cross_entropy_loss(input, target, ...)\nIndexError: Target 15 is out of bounds.\n"""

# No error message
# user_error = "success."
# Compute the layered matching score (average of per-layer best matches)
def compute_layered_similarity_avg(user_vecs, db_index, layer_weights=None):
    weighted_score = 0.0
    if layer_weights is None:
        layer_weights = np.ones(len(user_vecs))  # equal weight of 1 for every layer by default
    for i, u_vec in enumerate(user_vecs):
        u_vec = u_vec.astype('float32').reshape(1, -1)
        sim, _ = db_index.search(u_vec, k=1)  # similarity between this layer and its closest vector in the entry's index
        max_sim = sim[0][0]
        weighted_score += max_sim * layer_weights[i]
    return weighted_score / np.sum(layer_weights) if len(user_vecs) > 0 else 0.0


# Compute the layered matching score with non-linear, depth-increasing weights
def compute_layered_similarity_sco(user_vecs, db_index, layer_weights=None):
    weighted_score = 0.0
    if layer_weights is None:
        # Non-linear weights on a log scale: weights increase from shallow to deep frames
        layer_weights = np.logspace(0, 1, len(user_vecs))
        layer_weights /= np.sum(layer_weights)  # normalize the weights so they sum to 1
    for i, u_vec in enumerate(user_vecs):
        u_vec = u_vec.astype('float32').reshape(1, -1)
        sim, _ = db_index.search(u_vec, k=1)  # similarity between this layer and its closest vector in the entry's index
        max_sim = sim[0][0]
        weighted_score += max_sim * layer_weights[i]
    return weighted_score if len(user_vecs) > 0 else 0.0


# Time the query
start_query_time = time.time()

cleaned_user = clean_traceback(user_error)
user_layers = split_traceback_layers(cleaned_user)
user_vectors = model.encode(user_layers)
user_agg_vector = np.mean(user_vectors, axis=0).astype('float32').reshape(1, -1)

# Coarse retrieval on the aggregate index: find the top-k closest entries
k = 5
sim, indices = aggregate_index.search(user_agg_vector, k)

# Print the retrieved indices
print(f"Indices of the top {k} closest entries: {indices[0]}")

# Fine-grained matching among the candidates
best_match = None
best_score = -1
best_match_index = -1  # index of the most similar error stack
for idx in indices[0]:
    db_entry = layered_error_db[idx]
    score = compute_layered_similarity_sco(user_vectors, db_entry["index"])
    if score > best_score:
        best_score = score
        best_match = db_entry
        best_match_index = idx  # update the index of the most similar stack

end_query_time = time.time()
logging.info(f"Query time: {end_query_time - start_query_time:.2f} s")

# Print the match results
print("--- Cleaned user error ---")
print(cleaned_user)
print("\n--- Most similar error stack ---")
print("\n".join(best_match["layers"]))
print("\n--- Suggested solution ---")
print(best_match["solution"])
print(f"\n--- Match score (weighted layered max similarity): {best_score:.4f}")
print(f"\n--- Index of the most similar error stack: {best_match_index}")
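
# --- Optional variant (a minimal sketch, not used by the pipeline above) ---
# IndexFlatIP returns raw dot products, and model.encode() does not normalize its
# output by default, so the layered scores above are unbounded rather than true
# cosine similarities. If comparable cosine scores are preferred, the vectors can
# be L2-normalized before indexing and before searching. The helper name
# build_cosine_index is hypothetical and not part of the original script.
def build_cosine_index(layers):
    """Encode traceback layers and build a cosine-similarity FAISS index."""
    vecs = model.encode(layers, normalize_embeddings=True).astype('float32')  # unit-norm rows
    cosine_index = faiss.IndexFlatIP(vecs.shape[1])  # inner product == cosine for unit-norm vectors
    cosine_index.add(vecs)
    return cosine_index
# Query vectors passed to cosine_index.search() would need the same normalization.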