model_name = 'maidalun1020/bce-embedding-base_v1'
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'batch_size': 64, 'normalize_embeddings': True, 'show_progress_bar': False}

embed_model = HuggingFaceEmbeddings(



Import error when using BCERerank with LlamaIndex

I installed llama-index 0.10.14 and, following the demo, ran `from BCEmbedding.tools.llama_index import BCERerank`.
The import fails with the following error:
ModuleNotFoundError Traceback (most recent call last)
Cell In[24], line 14
12 from llama_index.vector_stores.postgres import PGVectorStore
13 import os
---> 14 from import BCERerank
15 from flask import Flask, request, jsonify

File ~/anaconda3/lib/python3.11/site-packages/BCEmbedding/tools/llama_index/
1 '''
2 @description:
3 @author: shenlei
6 @LastEditors: shenlei
7 '''
----> 8 from .bce_rerank import BCERerank

File ~/anaconda3/lib/python3.11/site-packages/BCEmbedding/tools/llama_index/
1 '''
2 @description:
3 @author: shenlei
6 @LastEditors: shenlei
7 '''
8 from typing import Any, List, Optional
---> 10 from llama_index.bridge.pydantic import Field, PrivateAttr
11 from llama_index.callbacks import CBEventType, EventPayload
12 from llama_index.postprocessor.types import BaseNodePostprocessor

ModuleNotFoundError: No module named 'llama_index.bridge'

I also tried uninstalling and reinstalling llama-index, but that did not help. If you see this issue, please give me some suggestions.



ImportError Traceback (most recent call last)
Cell In[1], line 1
----> 1 import torch
2 import os
3 import re

File ~/.conda/envs/bce/lib/python3.10/site-packages/torch/
234 _load_global_deps()
--> 235 from torch._C import * # noqa: F403
237 # Appease the type checker; ordinarily this binding is inserted by the
238 # torch._C module initialization code in C

ImportError: /home/powerop/.conda/envs/bce/lib/python3.10/site-packages/torch/lib/ undefined symbol: ncclCommInitRankConfig




请问bce embedding以及bce rerank模型对表格语义的拟合能力如何?训练数据中是否会存在表格?如果存在的话是以markdown的形式体现的吗?





我使用 sentence_transformers 进行部署,
输入: [[你是谁,你是谁],[你是谁,今年几岁]]
输出 [0.625,0.425]
为什么第一个 pair 算出的分数这么低


reranker = BCERerank(model="./bce-reranker-base_v1", top_n=5, device='cuda:0')


ValueError Traceback (most recent call last)
Cell In[6], line 39
31 embed_model = HuggingFaceEmbeddings(
32 model_name=embedding_model_name,
33 model_kwargs=embedding_model_kwargs,
34 encode_kwargs=embedding_encode_kwargs
35 )
36 # 创建一个reranker模型
37 # reranker_args = {'model': './bce-reranker-base_v1', 'top_n': 5, 'device': 'cuda:0'}
38 # reranker = BCERerank()
---> 39 reranker = BCERerank(model="./bce-reranker-base_v1", top_n=5, device='cuda:0')

File /mnt/workspace/BCEmbedding/BCEmbedding/tools/langchain/, in BCERerank.init(self, top_n, model, device, **kwargs)
50 except ImportError:
51 raise ImportError(
52 "Cannot import BCEmbedding package,",
53 "please pip install BCEmbedding>=0.1.2",
54 )
---> 55 self._model = RerankerModel(model_name_or_path=model, device=device, **kwargs)
56 super().init(top_n=top_n, model=model)

File /opt/conda/lib/python3.10/site-packages/pydantic/v1/, in BaseModel.setattr(self, name, value)
354 return object_setattr(self, name, value)
356 if self.config.extra is not Extra.allow and name not in self.fields:
--> 357 raise ValueError(f'"{}" object has no field "{name}"')
358 elif not self.config.allow_mutation or self.config.frozen:
359 raise TypeError(f'"{}" is immutable and does not support item assignment')

ValueError: "BCERerank" object has no field "_model"

Request for non-NVIDIA GPU compatibility

问题: 在执行embedding and Reranker Integrations for RAG Frameworks例子的时候, 两种方式都会报以下错误

Traceback (most recent call last):
  File "/home/zc/miniconda3/BCEmbedding/", line 18, in <module>
    embed_model = HuggingFaceEmbeddings(
  File "/home/zc/miniconda3/envs/bce/lib/python3.10/site-packages/langchain_community/embeddings/", line 65, in __init__
    self.client = sentence_transformers.SentenceTransformer(
  File "/home/zc/miniconda3/envs/bce/lib/python3.10/site-packages/sentence_transformers/", line 215, in __init__
  File "/home/zc/miniconda3/envs/bce/lib/python3.10/site-packages/torch/nn/modules/", line 1152, in to
    return self._apply(convert)
  File "/home/zc/miniconda3/envs/bce/lib/python3.10/site-packages/torch/nn/modules/", line 802, in _apply
  File "/home/zc/miniconda3/envs/bce/lib/python3.10/site-packages/torch/nn/modules/", line 802, in _apply
  File "/home/zc/miniconda3/envs/bce/lib/python3.10/site-packages/torch/nn/modules/", line 802, in _apply
  [Previous line repeated 1 more time]
  File "/home/zc/miniconda3/envs/bce/lib/python3.10/site-packages/torch/nn/modules/", line 825, in _apply
    param_applied = fn(param)
  File "/home/zc/miniconda3/envs/bce/lib/python3.10/site-packages/torch/nn/modules/", line 1150, in convert
    return, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking)
  File "/home/zc/miniconda3/envs/bce/lib/python3.10/site-packages/torch/cuda/", line 302, in _lazy_init
RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from

是否有例子可以增加对其他类型GPU的支持,或者有没有什么其他方案, 感谢感谢



我关注到bce-reranker-base_v1使用的base model只支持512长度的输入(position embedding限制了长度),大于512长度则是通过“把长passage分成多个chunk,每个chunk分别求score,在取max”的形式。我担心这样的做法还是会丢失一部分长文本的原始语义,有办法让模型支持原生的passage输入超过512吗





04/29/2024 15:58:03 - [INFO] -BCEmbedding.models.RerankerModel->>> Loading from /workspace/bce-reranker-base_v1.
04/29/2024 15:58:04 - [INFO] -BCEmbedding.models.RerankerModel->>> Execute device: cuda; gpu num: 2; use fp16: False
Calculate scores: 0%| | 0/1 [00:00<?, ?it/s]Bus error (core dumped)

如上日志, 使用两块3090gpu;
在Docker内跑, 就必现的崩溃;
如果是改为CPU跑,就没有问题,但是cpu跑的比较慢, 按秒算;


[2024-04-05 03:07:58 WARNING] onnx2trt_utils.cpp:374: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.

model = AutoModel.from_pretrained('./bce-emb')

def make_train_dummy_input(seq_len):
    org_input_ids = torch.tensor(
        [[i for i in range(seq_len)]], dtype=torch.int32)
    org_input_mask = torch.tensor([[1 for i in range(int(
        seq_len/2))] + [1 for i in range(seq_len - int(seq_len/2))]], dtype=torch.int32)
    return (,


with torch.no_grad():
    org_dummy_input = make_train_dummy_input(64)
    # print(org_dummy_input)
    output = torch.onnx.export(model,
                               # 需要注意顺序!不可随意改变, 否则结果与预期不符
                                   'input_ids', 'attention_mask'],
                               # 需要注意顺序, 否则在推理阶段可能用错output_names
                               dynamic_axes={"input_ids": {0: "batch_size", 1: "sequence_length"},
                                             "attention_mask": {0: "batch_size", 1: "sequence_length"},
                                             "logits": {0: "batch_size"}

trt转换如下:环境为 nvidia的官方docker,tensorrt版本为8.6.1。

trtexec --onnx=/workspace/bce-emb.onnx \
--saveEngine=/workspace/model.plan \
--minShapes=input_ids:1x1,attention_mask:1x1 \
--optShapes=input_ids:4x128,attention_mask:4x128 \
--maxShapes=input_ids:64x512,attention_mask:64x512 \


[04/05/2024-12:27:54] [I] === Model Options ===
[04/05/2024-12:27:54] [I] Format: ONNX
[04/05/2024-12:27:54] [I] Model: /workspace/bce-emb.onnx
[04/05/2024-12:27:54] [I] Output:
[04/05/2024-12:27:54] [I] === Build Options ===
[04/05/2024-12:27:54] [I] Max batch: explicit batch
[04/05/2024-12:27:54] [I] Memory Pools: workspace: 8192 MiB, dlaSRAM: default, dlaLocalDRAM: default, dlaGlobalDRAM: default
[04/05/2024-12:27:54] [I] minTiming: 1
[04/05/2024-12:27:54] [I] avgTiming: 8
[04/05/2024-12:27:54] [I] Precision: FP32
[04/05/2024-12:27:54] [I] LayerPrecisions:
[04/05/2024-12:27:54] [I] Layer Device Types:
[04/05/2024-12:27:54] [I] Calibration:
[04/05/2024-12:27:54] [I] Refit: Disabled
[04/05/2024-12:27:54] [I] Version Compatible: Disabled
[04/05/2024-12:27:54] [I] TensorRT runtime: full
[04/05/2024-12:27:54] [I] Lean DLL Path:
[04/05/2024-12:27:54] [I] Tempfile Controls: { in_memory: allow, temporary: allow }
[04/05/2024-12:27:54] [I] Exclude Lean Runtime: Disabled
[04/05/2024-12:27:54] [I] Sparsity: Disabled
[04/05/2024-12:27:54] [I] Safe mode: Disabled
[04/05/2024-12:27:54] [I] Build DLA standalone loadable: Disabled
[04/05/2024-12:27:54] [I] Allow GPU fallback for DLA: Disabled
[04/05/2024-12:27:54] [I] DirectIO mode: Disabled
[04/05/2024-12:27:54] [I] Restricted mode: Disabled
[04/05/2024-12:27:54] [I] Skip inference: Disabled
[04/05/2024-12:27:54] [I] Save engine: /workspace/model.plan
[04/05/2024-12:27:54] [I] Load engine:
[04/05/2024-12:27:54] [I] Profiling verbosity: 0
[04/05/2024-12:27:54] [I] Tactic sources: Using default tactic sources
[04/05/2024-12:27:54] [I] timingCacheMode: local
[04/05/2024-12:27:54] [I] timingCacheFile:
[04/05/2024-12:27:54] [I] Heuristic: Disabled
[04/05/2024-12:27:54] [I] Preview Features: Use default preview flags.
[04/05/2024-12:27:54] [I] MaxAuxStreams: -1
[04/05/2024-12:27:54] [I] BuilderOptimizationLevel: -1
[04/05/2024-12:27:54] [I] Input(s)s format: fp32:CHW
[04/05/2024-12:27:54] [I] Output(s)s format: fp32:CHW
[04/05/2024-12:27:54] [I] Input build shape: input_ids=1x1+4x128+64x512
[04/05/2024-12:27:54] [I] Input build shape: attention_mask=1x1+4x128+64x512
[04/05/2024-12:27:54] [I] Input calibration shapes: model
[04/05/2024-12:27:54] [I] === System Options ===
[04/05/2024-12:27:54] [I] Device: 0
[04/05/2024-12:27:54] [I] DLACore:
[04/05/2024-12:27:54] [I] Plugins:
[04/05/2024-12:27:54] [I] setPluginsToSerialize:
[04/05/2024-12:27:54] [I] dynamicPlugins:
[04/05/2024-12:27:54] [I] ignoreParsedPluginLibs: 0
[04/05/2024-12:27:54] [I]
[04/05/2024-12:27:54] [I] === Inference Options ===
[04/05/2024-12:27:54] [I] Batch: Explicit
[04/05/2024-12:27:54] [I] Input inference shape: attention_mask=4x128
[04/05/2024-12:27:54] [I] Input inference shape: input_ids=4x128
[04/05/2024-12:27:54] [I] Iterations: 10
[04/05/2024-12:27:54] [I] Duration: 3s (+ 200ms warm up)
[04/05/2024-12:27:54] [I] Sleep time: 0ms
[04/05/2024-12:27:54] [I] Idle time: 0ms
[04/05/2024-12:27:54] [I] Inference Streams: 1
[04/05/2024-12:27:54] [I] ExposeDMA: Disabled
[04/05/2024-12:27:54] [I] Data transfers: Enabled
[04/05/2024-12:27:54] [I] Spin-wait: Disabled
[04/05/2024-12:27:54] [I] Multithreading: Disabled
[04/05/2024-12:27:54] [I] CUDA Graph: Disabled
[04/05/2024-12:27:54] [I] Separate profiling: Disabled
[04/05/2024-12:27:54] [I] Time Deserialize: Disabled
[04/05/2024-12:27:54] [I] Time Refit: Disabled
[04/05/2024-12:27:54] [I] NVTX verbosity: 0
[04/05/2024-12:27:54] [I] Persistent Cache Ratio: 0
[04/05/2024-12:27:54] [I] Inputs:
[04/05/2024-12:27:54] [I] === Reporting Options ===
[04/05/2024-12:27:54] [I] Verbose: Disabled
[04/05/2024-12:27:54] [I] Averages: 10 inferences
[04/05/2024-12:27:54] [I] Percentiles: 90,95,99
[04/05/2024-12:27:54] [I] Dump refittable layers:Disabled
[04/05/2024-12:27:54] [I] Dump output: Disabled
[04/05/2024-12:27:54] [I] Profile: Disabled
[04/05/2024-12:27:54] [I] Export timing to JSON file:
[04/05/2024-12:27:54] [I] Export output to JSON file:
[04/05/2024-12:27:54] [I] Export profile to JSON file:
[04/05/2024-12:27:54] [I]
[04/05/2024-12:27:54] [I] === Device Information ===
[04/05/2024-12:27:54] [I] Selected Device: NVIDIA A10
[04/05/2024-12:27:54] [I] Compute Capability: 8.6
[04/05/2024-12:27:54] [I] SMs: 72
[04/05/2024-12:27:54] [I] Device Global Memory: 22731 MiB
[04/05/2024-12:27:54] [I] Shared Memory per SM: 100 KiB
[04/05/2024-12:27:54] [I] Memory Bus Width: 384 bits (ECC enabled)
[04/05/2024-12:27:54] [I] Application Compute Clock Rate: 1.695 GHz
[04/05/2024-12:27:54] [I] Application Memory Clock Rate: 6.251 GHz
[04/05/2024-12:27:54] [I]
[04/05/2024-12:27:54] [I] Note: The application clock rates do not reflect the actual clock rates that the GPU is currently running at.
[04/05/2024-12:27:54] [I]
[04/05/2024-12:27:54] [I] TensorRT version: 8.6.1
[04/05/2024-12:27:54] [I] Loading standard plugins
[04/05/2024-12:27:55] [I] [TRT] [MemUsageChange] Init CUDA: CPU +520, GPU +0, now: CPU 537, GPU 13924 (MiB)
[04/05/2024-12:28:01] [I] [TRT] [MemUsageChange] Init builder kernel library: CPU +1436, GPU +266, now: CPU 2050, GPU 14190 (MiB)
[04/05/2024-12:28:01] [W] [TRT] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usageand speed up TensorRT initialization. See "Lazy Loading" section of CUDA documentation
[04/05/2024-12:28:01] [I] Start parsing network model.
[libprotobuf WARNING google/protobuf/io/] Reading dangerously large protocol message.  If the message turns out to be larger than 2147483647 bytes, parsing will be halted for security reasons.  To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.
[libprotobuf WARNING google/protobuf/io/] The total number of bytes read was 1118829273
[04/05/2024-12:28:09] [I] [TRT] ----------------------------------------------------------------
[04/05/2024-12:28:09] [I] [TRT] Input filename:   /workspace/bce-emb.onnx
[04/05/2024-12:28:09] [I] [TRT] ONNX IR version:  0.0.8
[04/05/2024-12:28:09] [I] [TRT] Opset version:    17
[04/05/2024-12:28:09] [I] [TRT] Producer name:    pytorch
[04/05/2024-12:28:09] [I] [TRT] Producer version: 2.1.2
[04/05/2024-12:28:09] [I] [TRT] Domain:
[04/05/2024-12:28:09] [I] [TRT] Model version:    0
[04/05/2024-12:28:09] [I] [TRT] Doc string:
[04/05/2024-12:28:09] [I] [TRT] ----------------------------------------------------------------
[libprotobuf WARNING google/protobuf/io/] Reading dangerously large protocol message.  If the message turns out to be larger than 2147483647 bytes, parsing will be halted for security reasons.  To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.
[libprotobuf WARNING google/protobuf/io/] The total number of bytes read was 1118829273
[04/05/2024-12:28:11] [W] [TRT] onnx2trt_utils.cpp:374: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
[04/05/2024-12:28:12] [I] Finished parsing network model. Parse time: 10.2094
[04/05/2024-12:28:12] [I] [TRT] Graph optimization time: 0.0657748 seconds.
[04/05/2024-12:28:12] [I] [TRT] Local timing cache in use. Profiling results in this builder pass will not be stored.
[04/05/2024-12:28:28] [I] [TRT] Detected 2 inputs and 2 output network tensors.
[04/05/2024-12:28:31] [I] [TRT] Total Host Persistent Memory: 48
[04/05/2024-12:28:31] [I] [TRT] Total Device Persistent Memory: 0
[04/05/2024-12:28:31] [I] [TRT] Total Scratch Memory: 2114454528
[04/05/2024-12:28:31] [I] [TRT] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 1060 MiB, GPU 3512MiB
[04/05/2024-12:28:31] [I] [TRT] [BlockAssignment] Started assigning block shifts. This will take 2 steps to complete.
[04/05/2024-12:28:31] [I] [TRT] [BlockAssignment] Algorithm ShiftNTopDown took 0.013715ms to assign 2 blocks to 2 nodes requiring 2114455040 bytes.
[04/05/2024-12:28:31] [I] [TRT] Total Activation Memory: 2114455040
[04/05/2024-12:28:31] [I] [TRT] [MemUsageChange] TensorRT-managed allocation in building engine: CPU +0, GPU +2048, now: CPU 0, GPU 2048 (MiB)
[04/05/2024-12:28:39] [I] Engine built in 44.7706 sec.
[04/05/2024-12:28:40] [I] [TRT] Loaded engine size: 1063 MiB
[04/05/2024-12:28:40] [I] [TRT] [MemUsageChange] TensorRT-managed allocation in engine deserialization: CPU +0, GPU +1060,now: CPU 0, GPU 1060 (MiB)
[04/05/2024-12:28:40] [I] Engine deserialized in 0.121818 sec.
[04/05/2024-12:28:40] [I] [TRT] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +2017, now: CPU 0, GPU 3077 (MiB)
[04/05/2024-12:28:40] [W] [TRT] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usageand speed up TensorRT initialization. See "Lazy Loading" section of CUDA documentation
[04/05/2024-12:28:40] [I] Setting persistentCacheLimit to 0 bytes.
[04/05/2024-12:28:40] [I] Using random values for input input_ids
[04/05/2024-12:28:40] [I] Input binding for input_ids with dimensions 4x128 is created.
[04/05/2024-12:28:40] [I] Using random values for input attention_mask
[04/05/2024-12:28:40] [I] Input binding for attention_mask with dimensions 4x128 is created.
[04/05/2024-12:28:40] [I] Output binding for logits with dimensions 4x128x768 is created.
[04/05/2024-12:28:40] [I] Output binding for 1488 with dimensions 4x768 is created.
[04/05/2024-12:28:40] [I] Starting inference
[04/05/2024-12:28:43] [I] Warmup completed 44 queries over 200 ms
[04/05/2024-12:28:43] [I] Timing trace has 634 queries over 3.01157 s
[04/05/2024-12:28:43] [I]
[04/05/2024-12:28:43] [I] === Trace details ===
[04/05/2024-12:28:43] [I] Trace averages of 10 runs:
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.71777 ms - Host latency: 4.8111 ms (enqueue 4.68778 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.69606 ms - Host latency: 4.78887 ms (enqueue 4.66904 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.71172 ms - Host latency: 4.80529 ms (enqueue 4.6842 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.71255 ms - Host latency: 4.80536 ms (enqueue 4.68546 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.71368 ms - Host latency: 4.8067 ms (enqueue 4.68657 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.71378 ms - Host latency: 4.8076 ms (enqueue 4.68813 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.7095 ms - Host latency: 4.80175 ms (enqueue 4.68302 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.71083 ms - Host latency: 4.80322 ms (enqueue 4.68549 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.70866 ms - Host latency: 4.80078 ms (enqueue 4.68127 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.71213 ms - Host latency: 4.80558 ms (enqueue 4.68696 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.7102 ms - Host latency: 4.80333 ms (enqueue 4.68237 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.7098 ms - Host latency: 4.80272 ms (enqueue 4.68302 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.71245 ms - Host latency: 4.80533 ms (enqueue 4.68322 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.79283 ms - Host latency: 4.88607 ms (enqueue 4.73782 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 5.08766 ms - Host latency: 5.18107 ms (enqueue 5.05737 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.84741 ms - Host latency: 4.94027 ms (enqueue 4.84573 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.74828 ms - Host latency: 4.84208 ms (enqueue 4.72477 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.71704 ms - Host latency: 4.81016 ms (enqueue 4.68956 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.70856 ms - Host latency: 4.80144 ms (enqueue 4.6801 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.711 ms - Host latency: 4.80448 ms (enqueue 4.68986 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.71707 ms - Host latency: 4.8125 ms (enqueue 4.68035 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.71266 ms - Host latency: 4.80608 ms (enqueue 4.68488 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.71327 ms - Host latency: 4.8064 ms (enqueue 4.68513 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.70937 ms - Host latency: 4.80234 ms (enqueue 4.684 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.71021 ms - Host latency: 4.80277 ms (enqueue 4.68435 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.70916 ms - Host latency: 4.80209 ms (enqueue 4.68029 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.71064 ms - Host latency: 4.80372 ms (enqueue 4.68414 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.71089 ms - Host latency: 4.80433 ms (enqueue 4.68595 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.71422 ms - Host latency: 4.80765 ms (enqueue 4.68622 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.71155 ms - Host latency: 4.80374 ms (enqueue 4.68427 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.70978 ms - Host latency: 4.80316 ms (enqueue 4.68428 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.71857 ms - Host latency: 4.81082 ms (enqueue 4.68978 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.71797 ms - Host latency: 4.8114 ms (enqueue 4.68748 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.80277 ms - Host latency: 4.89617 ms (enqueue 4.76615 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.90116 ms - Host latency: 4.99402 ms (enqueue 4.87052 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.84751 ms - Host latency: 4.94182 ms (enqueue 4.83071 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.75146 ms - Host latency: 4.84404 ms (enqueue 4.72413 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.74745 ms - Host latency: 4.84027 ms (enqueue 4.72244 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.74814 ms - Host latency: 4.84119 ms (enqueue 4.72034 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.75085 ms - Host latency: 4.8439 ms (enqueue 4.72666 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.70796 ms - Host latency: 4.79934 ms (enqueue 4.68162 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.71643 ms - Host latency: 4.80862 ms (enqueue 4.68887 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.70688 ms - Host latency: 4.79854 ms (enqueue 4.67922 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.70933 ms - Host latency: 4.8021 ms (enqueue 4.68403 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.71401 ms - Host latency: 4.80779 ms (enqueue 4.68694 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.70757 ms - Host latency: 4.80063 ms (enqueue 4.67991 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.70662 ms - Host latency: 4.79973 ms (enqueue 4.67981 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.71475 ms - Host latency: 4.80798 ms (enqueue 4.68315 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.7467 ms - Host latency: 4.83914 ms (enqueue 4.71975 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.73523 ms - Host latency: 4.828 ms (enqueue 4.70801 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.74326 ms - Host latency: 4.83728 ms (enqueue 4.71604 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.72288 ms - Host latency: 4.81548 ms (enqueue 4.6989 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.74504 ms - Host latency: 4.83687 ms (enqueue 4.71489 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.76904 ms - Host latency: 4.86096 ms (enqueue 4.73633 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.82097 ms - Host latency: 4.91309 ms (enqueue 4.79429 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.75395 ms - Host latency: 4.84707 ms (enqueue 4.73091 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.78035 ms - Host latency: 4.87405 ms (enqueue 4.74929 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.76145 ms - Host latency: 4.85464 ms (enqueue 4.73542 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.77607 ms - Host latency: 4.86899 ms (enqueue 4.74966 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.76257 ms - Host latency: 4.85547 ms (enqueue 4.73748 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.75317 ms - Host latency: 4.84756 ms (enqueue 4.72434 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.7521 ms - Host latency: 4.84453 ms (enqueue 4.72729 ms)
[04/05/2024-12:28:43] [I] Average on 10 runs - GPU latency: 4.74985 ms - Host latency: 4.84331 ms (enqueue 4.72151 ms)
[04/05/2024-12:28:43] [I]
[04/05/2024-12:28:43] [I] === Performance summary ===
[04/05/2024-12:28:43] [I] Throughput: 210.521 qps
[04/05/2024-12:28:43] [I] Latency: min = 4.78314 ms, max = 5.24432 ms, mean = 4.8347 ms, median = 4.80896 ms, percentile(90%) = 4.88477 ms, percentile(95%) = 4.93652 ms, percentile(99%) = 5.16418 ms
[04/05/2024-12:28:43] [I] Enqueue Time: min = 4.5 ms, max = 5.13123 ms, mean = 4.71434 ms, median = 4.69315 ms, percentile(90%) = 4.76709 ms, percentile(95%) = 4.8573 ms, percentile(99%) = 5.03857 ms
[04/05/2024-12:28:43] [I] H2D Latency: min = 0.00610352 ms, max = 0.0211182 ms, mean = 0.00693844 ms, median = 0.00683594 ms, percentile(90%) = 0.00756836 ms, percentile(95%) = 0.0078125 ms, percentile(99%) = 0.00830078 ms
[04/05/2024-12:28:43] [I] GPU Compute Time: min = 4.68994 ms, max = 5.15076 ms, mean = 4.74169 ms, median = 4.71545 ms, percentile(90%) = 4.79224 ms, percentile(95%) = 4.84351 ms, percentile(99%) = 5.07086 ms
[04/05/2024-12:28:43] [I] D2H Latency: min = 0.081543 ms, max = 0.0933533 ms, mean = 0.0860713 ms, median = 0.0859375 ms, percentile(90%) = 0.0877686 ms, percentile(95%) = 0.0881348 ms, percentile(99%) = 0.0895996 ms
[04/05/2024-12:28:43] [I] Total Host Walltime: 3.01157 s
[04/05/2024-12:28:43] [I] Total GPU Compute Time: 3.00623 s
[04/05/2024-12:28:43] [W] * Throughput may be bound by Enqueue Time rather than GPU Compute and the GPU may be under-utilized.
[04/05/2024-12:28:43] [W]   If not already in use, --useCudaGraph (utilize CUDA graphs where possible) may increase the throughput.
[04/05/2024-12:28:43] [W] * GPU compute time is unstable, with coefficient of variance = 1.33615%.
[04/05/2024-12:28:43] [W]   If not already in use, locking GPU clock frequency or adding --useSpinWait may improve the stability.
[04/05/2024-12:28:43] [I] Explanations of the performance metrics are printed in the verbose logs.

查看过log信息,有些异常的就只有INT64转为INT32的warning。请问你们遇到过这种问题吗?能提供一些参考的思路么?万分感谢。


[["what is panda", "panda is an animal"]] 能得到和pytorch一致的推理结果。
但在执行[["what is panda", "panda is an animal"],["what is panda", "panda is an animal"] ]时triton推理出来未经处理的结果和pytorch完全不一致。就很奇怪... 请问对于rerank有没有开源的onnx或者plan模型呢?如果我解决上述问题也愿意贡献可直接使用的转换模型。

BCE Fine tuning

您好,我在您的项目基础上构建了一个端到端的文本匹配模型,使用了BCEmbedding进行Feature Extraction,余弦相似度进行相似比较召回以及Reranker进行精排,取得了不错的效果。

现在希望进行Fine tuning,想在我们的细分领域下有更好的表现,请问您是否有更新Fine tuning相关内容的后续计划呢?




关于bce reranker模型,文本长度是多少?若是超过长度,他的处理机制是什么?


['运费是多少?', '打电话0.1元每分钟,短信0.1元每条扣费的喔亲亲~,接听电话免费']
['运费是多少', '打电话0.1元每分钟,短信0.1元每条扣费的喔亲亲~,接听电话免费']


  1. 这两个例子很明显分数都偏高了,语义完全不相关的
  2. 第一个例子只比第二个例子的query多了一个问号,分数就高很多,这个感觉也不合理

AttributeError: 'SequenceClassifierOutput' object has no attribute 'last_hidden_state'

from transformers import AutoModel, AutoTokenizer

# list of sentences
sentences = ['sentence_0', 'sentence_1']

# init model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('maidalun1020/bce-embedding-base_v1')
model = AutoModel.from_pretrained('maidalun1020/bce-embedding-base_v1')

device = 'cpu'  # if no GPU, set "cpu"

# get inputs
inputs = tokenizer(sentences, padding=True, truncation=True, max_length=512, return_tensors="pt")
inputs_on_device = {k: v.to(device) for k, v in inputs.items()}

# get embeddings
outputs = model(**inputs_on_device, return_dict=True)
embeddings = outputs.last_hidden_state[:, 0]  # cls pooler
embeddings = embeddings / embeddings.norm(dim=1, keepdim=True)  # normalize

我从hf上将模型下载到本地,运行 embedding 的时候,遇到 error 如下:

AttributeError: 'SequenceClassifierOutput' object has no attribute 'last_hidden_state'



Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at bce-embedding-base_v1 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


想咨询下 对于同样的query和similar_questions,使用QAnything里面的rerank_server和BCEmbedding下 RerankerModel得到的排序结果不一样,是模型不一样吗



BCEembedding max_length








下载 tokenizer 时不能指定 auth token

models/embedding.py 和 models/reranker.py 的 __init__ 里面都有如下代码:

self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) # 这里没有 **kwargs
self.model = AutoModel.from_pretrained(model_name_or_path, **kwargs)

由于下载tokenizer文件时没有传**kwargs,所以指定的 use_auth_token 没有传递进去,下载报错。

Cannot access gated repo for url
Access to model maidalun1020/bce-embedding-base_v1 is restricted. You must be authenticated to access it.

