Pass kwargs to BM25

svilupp · web-flow · commit f9fb20af8b6a · 2025-03-06T10:26:02.000Z
Pass `find_closest` kwargs to `bm25` function
diff --git a/.gitignore b/.gitignore
@@ -7,3 +7,6 @@
 
 .DS_Store
 .vscode/
+
+# Scratch files
+_*.jl
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+## [0.2.1]
+
+### Fixed
+- Fixed `find_closest` to pass kwargs to `bm25` to allow for normalization of scores
+- Fixed a bug in `ChunkEmbeddingsIndex` where users couldn't create a bitpacked index with `embeddings` of type `BitMatrix` (to use `finder=BitPackedCosineSimilarity()`)
+
 ## [0.2.0]
 
 ### Added
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "RAGTools"
 uuid = "16ddad29-bbe8-45a7-857d-3d9514eb0023"
 authors = ["J S <49557684+svilupp@users.noreply.github.com> and contributors"]
-version = "0.2.0"
+version = "0.2.1"
 
 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
diff --git a/src/bm25.jl b/src/bm25.jl
@@ -103,7 +103,7 @@ end
 bm25(
     dtm::AbstractDocumentTermMatrix, query::AbstractVector{<:AbstractString};
     k1::Float32 = 1.2f0, b::Float32 = 0.75f0, normalize::Bool = false, normalize_max_tf::Real = 3,
-    normalize_min_doc_rel_length::Float32 = 1.0f0)
+    normalize_min_doc_rel_length::Float32 = 1.0f0, kwargs...)
 
 Scores all documents in `dtm` based on the `query`.
 
@@ -120,6 +120,7 @@ Theoretically, if you choose `normalize_max_tf` and `normalize_min_doc_rel_lengt
 - `normalize_min_doc_rel_length`: The minimum document relative length to normalize to. 0.5 is a good default.
 Ideally, pick the minimum document relative length of the corpus that is non-zero
 `min_doc_rel_length = minimum(x for x in doc_rel_length(chunkdata(key_index)) if x > 0) |> Float32`
+
 # Example
 ```
 documents = [["this", "is", "a", "test"], ["this", "is", "another", "test"], ["foo", "bar", "baz"]]
@@ -132,7 +133,6 @@ scores = bm25(dtm, query)
 Normalization is done by dividing the score by the maximum possible score (given some assumptions).
 It's useful to be get results in the same range as cosine similarity scores and when comparing different queries or documents.
 
-# Example
 ```
 documents = [["this", "is", "a", "test"], ["this", "is", "another", "test"], ["foo", "bar", "baz"]]
 dtm = document_term_matrix(documents)
@@ -149,7 +149,7 @@ scores_norm = bm25(dtm, query; normalize = true, normalize_max_tf, normalize_min
 function bm25(
         dtm::AbstractDocumentTermMatrix, query::AbstractVector{<:AbstractString};
         k1::Float32 = 1.2f0, b::Float32 = 0.75f0, normalize::Bool = false, normalize_max_tf::Real = 3,
-        normalize_min_doc_rel_length::Float32 = 0.5f0)
+        normalize_min_doc_rel_length::Float32 = 0.5f0, kwargs...)
     @assert normalize_max_tf>0 "normalize_max_tf term frequency must be positive (got $normalize_max_tf)"
     @assert normalize_min_doc_rel_length>0 "normalize_min_doc_rel_length must be positive (got $normalize_min_doc_rel_length)"
 
diff --git a/src/retrieval.jl b/src/retrieval.jl
@@ -65,8 +65,23 @@ Finds the closest chunks to a query embedding by measuring the BM25 similarity b
 
 Reference: [Wikipedia: BM25](https://en.wikipedia.org/wiki/Okapi_BM25).
 Implementation follows: [The Next Generation of Lucene Relevance](https://opensourceconnections.com/blog/2015/10/16/bm25-the-next-generation-of-lucene-relevation/).
-"""
-struct BM25Similarity <: AbstractSimilarityFinder end
+
+Fields mimic the arguments of `bm25`.
+
+# Fields
+- `k1`: The k1 parameter for BM25. Default is 1.2.
+- `b`: The b parameter for BM25. Default is 0.75.
+- `normalize`: Whether to normalize the scores. Default is false.
+- `normalize_max_tf`: The maximum term frequency to normalize to. Default is 3.
+- `normalize_min_doc_rel_length`: The minimum document relative length to normalize to. Default is 1.0.
+"""
+@kwdef struct BM25Similarity <: AbstractSimilarityFinder
+    k1::Float32 = 1.2f0
+    b::Float32 = 0.75f0
+    normalize::Bool = false
+    normalize_max_tf::Real = 3
+    normalize_min_doc_rel_length::Float32 = 1.0f0
+end
 
 """
     MultiFinder <: AbstractSimilarityFinder 
@@ -452,7 +467,6 @@ function find_closest(
     return positions[new_positions], scores
 end
 
-function max_bm25_score end
 """
     find_closest(
         finder::BM25Similarity, dtm::AbstractDocumentTermMatrix,
@@ -468,7 +482,9 @@ function find_closest(
         finder::BM25Similarity, dtm::AbstractDocumentTermMatrix,
         query_emb::AbstractVector{<:Real}, query_tokens::AbstractVector{<:AbstractString} = String[];
         top_k::Int = 100, minimum_similarity::AbstractFloat = -1.0, kwargs...)
-    scores = bm25(dtm, query_tokens)
+    ## Unroll the finder's fields into kwargs; caller-supplied kwargs are splatted last, so they take precedence
+    finder_kwargs = [f => getfield(finder, f) for f in fieldnames(BM25Similarity)]
+    scores = bm25(dtm, query_tokens; finder_kwargs..., kwargs...)
     top_k_min = min(top_k, length(scores))
     ## Take the top_k largest because higher is better in BM25
     ## BM25 score are non-negative but unbounded (grows with number of keywords)
diff --git a/src/types.jl b/src/types.jl
@@ -110,7 +110,7 @@ Previously, this struct was called `ChunkIndex`.
 """
 @kwdef struct ChunkEmbeddingsIndex{
     T1 <: AbstractString,
-    T2 <: Union{Nothing, Matrix{<:Real}},
+    T2 <: Union{Nothing, AbstractMatrix{<:Real}},
     T3 <: Union{Nothing, AbstractMatrix{<:Bool}},
     T4 <: Union{Nothing, AbstractVector}
 } <: AbstractChunkIndex