Add cosine_similarity to hn_mine #1179

Open · wants to merge 1 commit into base: master
2 changes: 2 additions & 0 deletions scripts/README.md
@@ -21,6 +21,7 @@ python hn_mine.py \
--input_file toy_finetune_data.jsonl \
--output_file toy_finetune_data_minedHN.jsonl \
--range_for_sampling 2-200 \
--similarity_range 0.3-0.8 \
--negative_number 15 \
--use_gpu_for_searching
```
@@ -29,6 +30,7 @@ python hn_mine.py \
- **`output_file`**: path to save JSON data with mined hard negatives for finetuning
- **`negative_number`**: the number of sampled negatives
- **`range_for_sampling`**: the rank range to sample negatives from. For example, `2-200` means sampling `negative_number` negatives from the top-2 to top-200 retrieved documents. **You can set a larger range to reduce the difficulty of the negatives (e.g., set it to `60-300` to sample negatives from the top-60 to top-300 passages).**
- **`similarity_range`**: the similarity-score range for sampling negatives, i.e. the allowed similarity between the query and a candidate negative. For example, `0.3-0.8` only keeps negatives whose similarity score to the query lies between 0.3 and 0.8, giving control over how hard the negatives are. Widening the range (e.g., `0.1-0.9`) reduces difficulty by admitting more diverse, less relevant negatives, while narrowing it toward the higher end (e.g., `0.6-0.8`) increases difficulty by keeping only the more relevant ones (see the sketch after this list).
- **`candidate_pool`**: the pool to retrieve negatives from. The default is `None`, in which case the script retrieves from the combination of all `neg` entries in `input_file`. The file format is the same as [pretrain data](https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/pretrain#2-data-format). If a `candidate_pool` is provided, the script retrieves negatives from that file instead.
- **`use_gpu_for_searching`**: whether to use faiss-gpu to retrieve negatives.
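
To make the interaction between the two windows concrete, here is a minimal, illustrative sketch of how a candidate negative is accepted. It mirrors the logic in `hn_mine.py`, but the function and argument names are hypothetical and not part of the script:

```python
# Illustrative only: how the rank window (range_for_sampling) and the
# similarity window (similarity_range) jointly accept a candidate negative.
import random

def select_negatives(scores, indexes, corpus, positives, query,
                     sample_range=(2, 200), similarity_range=(0.3, 0.8),
                     negative_number=15):
    lo, hi = sample_range                 # keep only retrieval ranks lo..hi
    min_sim, max_sim = similarity_range   # keep only scores in [min_sim, max_sim]
    candidates = []
    for score, inx in zip(scores[lo:hi], indexes[lo:hi]):
        if inx == -1:                     # FAISS pads missing hits with -1
            continue
        if not (min_sim <= score <= max_sim):
            continue                      # too close to the query, or too dissimilar to be useful
        if corpus[inx] in positives or corpus[inx] == query:
            continue                      # never use the query itself or a known positive
        candidates.append(corpus[inx])
    return random.sample(candidates, min(negative_number, len(candidates)))
```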

26 changes: 17 additions & 9 deletions scripts/hn_mine.py
@@ -15,6 +15,7 @@ def get_args():
    parser.add_argument('--candidate_pool', default=None, type=str)
    parser.add_argument('--output_file', default=None, type=str)
    parser.add_argument('--range_for_sampling', default="10-210", type=str, help="range to sample negatives")
+    parser.add_argument('--similarity_range', default="0.0-1.0", type=str, help="similarity range to sample negatives")
    parser.add_argument('--use_gpu_for_searching', action='store_true', help='use faiss-gpu')
    parser.add_argument('--negative_number', default=15, type=int, help='the number of negatives')
    parser.add_argument('--query_instruction_for_retrieval', default="")
@@ -55,7 +56,7 @@ def get_corpus(candidate_pool):
    return corpus


-def find_knn_neg(model, input_file, candidate_pool, output_file, sample_range, negative_number, use_gpu):
+def find_knn_neg(model, input_file, candidate_pool, output_file, sample_range, similarity_range, negative_number, use_gpu):
    corpus = []
    queries = []
    train_data = []
@@ -81,16 +82,21 @@ def find_knn_neg(model, input_file, candidate_pool, output_file, sample_range, n

    print('create index and search------------------')
    index = create_index(p_vecs, use_gpu=use_gpu)
-    _, all_inxs = batch_search(index, q_vecs, topk=sample_range[-1])
+    all_scores, all_inxs = batch_search(index, q_vecs, topk=sample_range[-1])
    assert len(all_inxs) == len(train_data)

+    min_sim, max_sim = similarity_range
    for i, data in enumerate(train_data):
        query = data['query']
-        inxs = all_inxs[i][sample_range[0]:sample_range[1]]
+        scores = all_scores[i]
+        inxs = all_inxs[i]
+
+        inxs = inxs[sample_range[0]:sample_range[1]]
+        scores = scores[sample_range[0]:sample_range[1]]
+
        filtered_inx = []
-        for inx in inxs:
-            if inx == -1: break
-            if corpus[inx] not in data['pos'] and corpus[inx] != query:
+        for score, inx in zip(scores, inxs):
+            if min_sim <= score <= max_sim and corpus[inx] not in data['pos'] and corpus[inx] != query:
                filtered_inx.append(inx)

        if len(filtered_inx) > negative_number:
@@ -108,8 +114,9 @@ def find_knn_neg(model, input_file, candidate_pool, output_file, sample_range, n

if __name__ == '__main__':
    args = get_args()
-    sample_range = args.range_for_sampling.split('-')
-    sample_range = [int(x) for x in sample_range]
+
+    sample_range = list(map(int, args.range_for_sampling.split('-')))
+    similarity_range = list(map(float, args.similarity_range.split('-')))

    model = FlagModel(args.model_name_or_path, query_instruction_for_retrieval=args.query_instruction_for_retrieval)

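For reference, the hyphen-separated strings above parse into numeric lists as shown below (a small illustrative snippet, not part of the PR):

```python
# Mirrors the parsing in the __main__ block; the values are just examples.
sample_range = list(map(int, "2-200".split('-')))           # [2, 200]
similarity_range = list(map(float, "0.3-0.8".split('-')))   # [0.3, 0.8]
# Caveat: '-' doubles as the separator, so a negative lower bound such as
# -0.1 (possible with cosine similarity) cannot be written as "-0.1-0.8".
```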
@@ -118,5 +125,6 @@ def find_knn_neg(model, input_file, candidate_pool, output_file, sample_range, n
                 candidate_pool=args.candidate_pool,
                 output_file=args.output_file,
                 sample_range=sample_range,
+                 similarity_range=similarity_range,
                 negative_number=args.negative_number,
-                 use_gpu=args.use_gpu_for_searching)
+                 use_gpu=args.use_gpu_for_searching)
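
After mining, one rough way to check that the new flag behaved as expected is to re-embed each query with its mined negatives and confirm the scores fall inside the requested window. This is only a sketch under assumptions: the model name is a placeholder (use whatever was passed as `--model_name_or_path`, with the same query instruction), and it relies on `FlagModel` returning normalized embeddings so that the inner product matches the scores used during mining.

```python
# Rough post-mining sanity check (not part of the PR).
import json
import numpy as np
from FlagEmbedding import FlagModel

model = FlagModel('BAAI/bge-base-en-v1.5')   # placeholder; match the mining model and instruction
min_sim, max_sim = 0.3, 0.8                  # the --similarity_range used for mining

out_of_range = 0
with open('toy_finetune_data_minedHN.jsonl') as f:
    for line in f:
        item = json.loads(line)
        q_vec = model.encode_queries([item['query']])   # shape (1, dim)
        n_vecs = model.encode(item['neg'])               # shape (num_neg, dim)
        sims = (q_vec @ n_vecs.T).ravel()                # inner product of normalized vectors
        out_of_range += int(np.sum((sims < min_sim) | (sims > max_sim)))

print(f'negatives outside [{min_sim}, {max_sim}]: {out_of_range}')
```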