bug fixed

parent 18febe6558
commit b2d4ce660d

perf.py | 85
@@ -5,12 +5,32 @@ import faiss
 import numpy as np
 import random
 import string
+import psutil
+import os
+# import pynvml
+
+# Initialize GPU monitoring
+# pynvml.nvmlInit()
+# gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # use the first GPU by default
+
+def get_memory_info():
+    process = psutil.Process(os.getpid())
+    mem_info = process.memory_info()
+    return mem_info.rss / 1024 / 1024  # in MB
+
+def get_gpu_memory_info():
+    # mem = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
+    # used = mem.used / 1024 / 1024  # MB
+    # total = mem.total / 1024 / 1024
+    # return used, total
+    return 0, 0
+
 
 # 1. Initialize the model
 model = SentenceTransformer('all-MiniLM-L6-v2')
 
 # 2. Simulate a larger error database
-NUM_ERRORS = 1000
+NUM_ERRORS = 100
 error_db_large = []
 
 def generate_random_traceback(num_lines=10):
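The new get_memory_info helper reports the resident set size (RSS) of the current process; psutil returns RSS in bytes, hence the two divisions by 1024 to get megabytes. A minimal usage sketch, assuming only that psutil is installed:

import os
import psutil

# RSS (resident set size) comes back in bytes; /1024/1024 converts to MB.
rss_mb = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
print(f"Current RSS: {rss_mb:.2f} MB")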
@@ -61,20 +81,32 @@ for entry in error_db_large:
         "original_error": entry["error"],
         "solution": entry["solution"]
     })
 
 cleaning_time = time.time() - start_time
 print(f"Cleaning {NUM_ERRORS} error messages took {cleaning_time:.4f} s")
+print(f"Memory usage: {get_memory_info():.2f} MB")
+gpu_used, gpu_total = get_gpu_memory_info()
+print(f"GPU memory usage: {gpu_used:.2f} / {gpu_total:.2f} MB")
 
 # 5. Embed the cleaned error messages and build the index
 start_time = time.time()
 embeddings_large = np.array([model.encode(e['cleaned_error']) for e in cleaned_error_db_large])
 embedding_time = time.time() - start_time
 print(f"Embedding {NUM_ERRORS} error messages took {embedding_time:.4f} s")
+print(f"Memory usage: {get_memory_info():.2f} MB")
+gpu_used, gpu_total = get_gpu_memory_info()
+print(f"GPU memory usage: {gpu_used:.2f} / {gpu_total:.2f} MB")
 
 start_time = time.time()
 index_large = faiss.IndexFlatL2(embeddings_large.shape[1])
 index_large.add(embeddings_large)
 index_building_time = time.time() - start_time
 print(f"Building the FAISS index with {NUM_ERRORS} vectors took {index_building_time:.4f} s")
+print(f"Memory usage: {get_memory_info():.2f} MB")
+gpu_used, gpu_total = get_gpu_memory_info()
+print(f"GPU memory usage: {gpu_used:.2f} / {gpu_total:.2f} MB")
 
 
 # 6. User error report (test query)
 user_error_performance = generate_random_traceback(num_lines=12)
 
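Each stage above (cleaning, embedding, index building) now prints its elapsed time followed by the same memory and GPU readings. One way to express that repeated pattern is a small helper such as the hypothetical report_stage below; the name and structure are illustrative only, and it assumes the get_memory_info / get_gpu_memory_info helpers defined earlier in this file:

import time

def report_stage(name, start_time):
    # Elapsed wall-clock time for the stage, then current resource usage.
    elapsed = time.time() - start_time
    print(f"{name} took {elapsed:.4f} s")
    print(f"Memory usage: {get_memory_info():.2f} MB")
    gpu_used, gpu_total = get_gpu_memory_info()
    print(f"GPU memory usage: {gpu_used:.2f} / {gpu_total:.2f} MB")

# e.g. report_stage("Index building", start_time)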
@@ -82,23 +114,38 @@ cleaned_user_error_performance = clean_traceback(user_error_performance)
 query_vec_performance = model.encode(cleaned_user_error_performance)
 query_vec_performance = np.array([query_vec_performance])
 
-# 7. Search the index for the most similar error
-start_time = time.time()
-D_performance, I_performance = index_large.search(query_vec_performance, k=1)
-search_time = time.time() - start_time
-print(f"Searching the FAISS index with {NUM_ERRORS} vectors took {search_time:.6f} s")
-
-# 8. Print the results
-matched_index_performance = I_performance[0][0]
-distance_performance = D_performance[0][0]
-cosine_similarity_performance = 1 - (distance_performance**2) / 2
-matched_entry_performance = cleaned_error_db_large[matched_index_performance]
-
-print("\n--- Performance test results ---")
-print("\n--- Cleaned user error message ---")
-print(cleaned_user_error_performance)
-print("\n--- Most similar known error message ---")
-print(matched_entry_performance['cleaned_error'])
-print("\n--- Suggested solution ---")
-print(matched_entry_performance['solution'])
-print(f"\n--- Match score (cosine similarity): {cosine_similarity_performance:.4f}")
+NUM_QUERIES = 100  # number of queries
+k = 10  # return the top k most similar errors for each query
+
+query_times = []
+similarities = []
+
+for i in range(NUM_QUERIES):
+    user_error = generate_random_traceback(num_lines=random.randint(8, 14))
+    cleaned_user_error = clean_traceback(user_error)
+    query_vec = model.encode(cleaned_user_error)
+    query_vec = np.array([query_vec])
+
+    start_query_time = time.time()
+    D, I = index_large.search(query_vec, k=k)
+    elapsed = time.time() - start_query_time
+    query_times.append(elapsed)
+
+    matched_index = I[0][0]
+    distance = D[0][0]
+    similarity = 1 - (distance ** 2) / 2
+    similarities.append(similarity)
+
+# Print summary statistics
+print(f"\n--- Repeated-query performance test ---")
+print(f"Total queries: {NUM_QUERIES}")
+print(f"Average query time: {np.mean(query_times) * 1000:.4f} ms")
+print(f"Min query time: {np.min(query_times) * 1000:.4f} ms")
+print(f"Max query time: {np.max(query_times) * 1000:.4f} ms")
+print(f"Query time std dev: {np.std(query_times) * 1000:.4f} ms")
+print(f"Average match similarity: {np.mean(similarities):.4f}")
+print(f"Similarity std dev: {np.std(similarities):.4f}")
+print(f"Memory usage: {get_memory_info():.2f} MB")
+gpu_used, gpu_total = get_gpu_memory_info()
+print(f"GPU memory usage: {gpu_used:.2f} / {gpu_total:.2f} MB")
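The loop converts each FAISS distance to a similarity with 1 - (distance ** 2) / 2. For unit-normalised vectors the underlying identity is ||u - v||^2 = 2 - 2*cos(u, v), i.e. cosine similarity equals 1 minus half the squared L2 distance; note that faiss.IndexFlatL2 reports squared L2 distances, so whether the extra squaring in the loop is needed depends on how the returned value is interpreted, and the conversion only holds if the embeddings are normalised. A minimal check of the identity, under the assumption of unit-normalised vectors:

import numpy as np

rng = np.random.default_rng(0)
u = rng.random(384).astype('float32')
v = rng.random(384).astype('float32')
u /= np.linalg.norm(u)  # normalise to unit length
v /= np.linalg.norm(v)

squared_l2 = float(np.sum((u - v) ** 2))  # what IndexFlatL2 would report
cosine = float(u @ v)
assert abs((1 - squared_l2 / 2) - cosine) < 1e-4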
@@ -90,7 +90,7 @@ for entry, agg_vec in zip(error_db, aggregate_vectors):
 
     layered_error_db.append({
         "layers": layers,
-        "vectors": embeddings,  # keep the original vectors (optional)
+        "vectors": embeddings,  # keep the original vectors
        "index": index,  # attach the FAISS index
         "solution": entry["solution"],
         "agg_vector": agg_vec  # attach the aggregated vector
@@ -105,9 +105,9 @@ logging.info(f"Database creation took {end_db_time - start_db_time:.2f} s")
 # Completely different
 # user_error = """"Traceback (most recent call last):\n File \"D:\\Develop\\Projects\\bash-hook\\matches\\perf_2.py\", line 94, in <module>\n D, I = index.search(query_vec, k=1)\n File \"C:\\Users\\qcqcqc\\.conda\\envs\\sentence_transformers\\lib\\site-packages\\faiss\\class_wrappers.py\", line 329, in replacement_search\n assert d == self.d\nAssertionError\n"""
 # Differs in the lower stack frames
-user_error = """Traceback (most recent call last):\n File \"image_error.py\", line 55, in <module>\n loss = criterion(outputs, labels)\n File \"module.py\", line 1553, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n File \"module.py\", line 1570, in _call_impl\n return forward_call(*args, **kwargs)\n File \"loss.py\", line 1200, in forward\n return F.mse_loss(input, target, reduction=self.reduction)\n File \"functional.py\", line 2301, in mse_loss\n return mean_squared_error_loss(input, target, ...)\nValueError: The size of input tensor must match the size of target tensor.\n"""
+# user_error = """Traceback (most recent call last):\n File \"image_error.py\", line 55, in <module>\n loss = criterion(outputs, labels)\n File \"module.py\", line 1553, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n File \"module.py\", line 1570, in _call_impl\n return forward_call(*args, **kwargs)\n File \"loss.py\", line 1200, in forward\n return F.mse_loss(input, target, reduction=self.reduction)\n File \"functional.py\", line 2301, in mse_loss\n return mean_squared_error_loss(input, target, ...)\nValueError: The size of input tensor must match the size of target tensor.\n"""
 # Differs in the top stack frame
-# user_error = """Traceback (most recent call last):\n File \"train_model.py\", line 72, in <module>\n loss = loss_function(predictions, ground_truth)\n File \"core.py\", line 442, in _call_function\n return self._execute(*args, **kwargs)\n File \"core.py\", line 450, in _execute\n return forward_execution(*args, **kwargs)\n File \"loss_functions.py\", line 205, in forward\n return F.cross_entropy(input, target, weight=self.weight,\n File \"functional.py\", line 3104, in cross_entropy\n return cross_entropy_loss(input, target, ...)\nIndexError: Target 15 is out of bounds.\n"""
+user_error = """Traceback (most recent call last):\n File \"train_model.py\", line 72, in <module>\n loss = loss_function(predictions, ground_truth)\n File \"core.py\", line 442, in _call_function\n return self._execute(*args, **kwargs)\n File \"core.py\", line 450, in _execute\n return forward_execution(*args, **kwargs)\n File \"loss_functions.py\", line 205, in forward\n return F.cross_entropy(input, target, weight=self.weight,\n File \"functional.py\", line 3104, in cross_entropy\n return cross_entropy_loss(input, target, ...)\nIndexError: Target 15 is out of bounds.\n"""
 # No error message
 # user_error = "success."
 
@@ -134,11 +134,12 @@ def compute_layered_similarity_avg(user_vecs, db_index, layer_weights=None):
 
 
 def compute_layered_similarity_sco(user_vecs, db_index, layer_weights=None):
-    score = 0.0
     weighted_score = 0.0
 
     if layer_weights is None:
-        layer_weights = np.linspace(0.1, 1, len(user_vecs))  # default: weights increase from shallow to deep layers
+        # Use a logarithmic scale to generate non-linear weights
+        layer_weights = np.logspace(0, 1, len(user_vecs))  # weights increase from shallow to deep layers
+        layer_weights /= np.sum(layer_weights)  # normalize the weights so they sum to 1
 
     for i, u_vec in enumerate(user_vecs):
         u_vec = u_vec.astype('float32').reshape(1, -1)
@@ -147,7 +148,7 @@ def compute_layered_similarity_sco(user_vecs, db_index, layer_weights=None):
 
         weighted_score += max_sim * layer_weights[i]
 
-    return weighted_score / np.sum(layer_weights) if len(user_vecs) > 0 else 0.0
+    return weighted_score if len(user_vecs) > 0 else 0.0
 
 
 # Timing: query
 
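In compute_layered_similarity_sco the default weights now come from np.logspace(0, 1, n), so they grow geometrically (from 1 up to 10) before being normalised to sum to 1; deeper layers therefore dominate more strongly than with the previous linear ramp, and because the defaults already sum to 1 the final score is returned without dividing by their sum again. If a caller passes its own layer_weights, they are now assumed to be normalised already. A small sketch of what the normalised default weights look like for four layers:

import numpy as np

layer_weights = np.logspace(0, 1, 4)    # [1.0, ~2.154, ~4.642, 10.0]
layer_weights /= np.sum(layer_weights)  # normalise so the weights sum to 1
print(layer_weights)                    # ~[0.056, 0.121, 0.261, 0.562]
print(layer_weights.sum())              # 1.0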