Skip to content

Commit 1f0bc36

Browse files
authored
Add ReaderUtil#partitionByLeaf to partition global docIDs by leaf reader (#15803)
1 parent 03d0ba7 commit 1f0bc36

File tree

3 files changed

+251
-0
lines changed

3 files changed

+251
-0
lines changed

lucene/CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,9 @@ API Changes
223223

224224
* GITHUB#15751: Prevent zero vectors in knn fields with cosine similarity configured (Vigya Sharma)
225225

226+
* GITHUB#15803: Add ReaderUtil#partitionByLeaf to partition doc IDs from
227+
ScoreDoc hits by leaf reader. (Zihan Xu)
228+
226229
New Features
227230
---------------------
228231

lucene/core/src/java/org/apache/lucene/index/ReaderUtil.java

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@
1616
*/
1717
package org.apache.lucene.index;
1818

19+
import java.util.Arrays;
1920
import java.util.List;
21+
import org.apache.lucene.search.ScoreDoc;
2022

2123
/**
2224
* Common util methods for dealing with {@link IndexReader}s and {@link IndexReaderContext}s.
@@ -25,6 +27,8 @@
2527
*/
2628
public final class ReaderUtil {
2729

30+
// Shared zero-length array used by partitionByLeaf for every leaf that received no hits; a
// zero-length array has no elements to modify, so one instance can be handed out repeatedly.
private static final int[] EMPTY_INT_ARRAY = new int[0];
31+
2832
/** Private constructor: this final utility class is used through its static methods only. */
private ReaderUtil() {} // no instance
2933

3034
/**
@@ -89,4 +93,54 @@ public static int subIndex(int n, List<LeafReaderContext> leaves) {
8993
}
9094
return hi;
9195
}
96+
97+
/**
98+
* Partitions global doc IDs from ScoreDoc array by leaf. Extracts doc IDs, sorts them, and
99+
* partitions across leaves.
100+
*
101+
* @param hits the ScoreDoc array (typically from TopDocs.scoreDocs)
102+
* @param leaves the index reader's leaves
103+
* @return array indexed by leaf ord, containing global doc IDs for that leaf (empty if no hits)
104+
*/
105+
public static int[][] partitionByLeaf(ScoreDoc[] hits, List<LeafReaderContext> leaves) {
106+
int numLeaves = leaves.size();
107+
int[][] result = new int[numLeaves][];
108+
if (hits.length == 0) {
109+
Arrays.fill(result, EMPTY_INT_ARRAY);
110+
return result;
111+
}
112+
int[] sortedDocIds = new int[hits.length];
113+
for (int i = 0; i < hits.length; i++) {
114+
sortedDocIds[i] = hits[i].doc;
115+
}
116+
Arrays.sort(sortedDocIds);
117+
int leafStart = 0;
118+
int leafIdx = 0;
119+
LeafReaderContext leaf = leaves.getFirst();
120+
int leafEnd = leaf.docBase + leaf.reader().maxDoc();
121+
for (int i = 0; i < sortedDocIds.length; i++) {
122+
int docId = sortedDocIds[i];
123+
while (docId >= leafEnd) {
124+
int count = i - leafStart;
125+
if (count == 0) {
126+
result[leafIdx] = EMPTY_INT_ARRAY;
127+
} else {
128+
result[leafIdx] = new int[count];
129+
System.arraycopy(sortedDocIds, leafStart, result[leafIdx], 0, count);
130+
}
131+
leafStart = i;
132+
leafIdx++;
133+
leaf = leaves.get(leafIdx);
134+
leafEnd = leaf.docBase + leaf.reader().maxDoc();
135+
}
136+
}
137+
// Handle remaining docIDs
138+
int count = sortedDocIds.length - leafStart;
139+
assert count > 0;
140+
result[leafIdx] = new int[count];
141+
System.arraycopy(sortedDocIds, leafStart, result[leafIdx], 0, count);
142+
// Fill remaining empty leaves
143+
Arrays.fill(result, leafIdx + 1, numLeaves, EMPTY_INT_ARRAY);
144+
return result;
145+
}
92146
}
lucene/core/src/test/org/apache/lucene/index/TestReaderUtil.java

Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.lucene.index;
18+
19+
import java.io.IOException;
20+
import java.util.Arrays;
21+
import java.util.HashSet;
22+
import java.util.List;
23+
import java.util.Set;
24+
import org.apache.lucene.document.Document;
25+
import org.apache.lucene.search.ScoreDoc;
26+
import org.apache.lucene.store.Directory;
27+
import org.apache.lucene.tests.util.LuceneTestCase;
28+
29+
public class TestReaderUtil extends LuceneTestCase {
30+
31+
public void testPartitionByLeafEmptyDocIds() throws IOException {
32+
try (Directory dir = newDirectory();
33+
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
34+
writer.addDocument(new Document());
35+
writer.addDocument(new Document());
36+
try (DirectoryReader reader = DirectoryReader.open(writer)) {
37+
List<LeafReaderContext> leaves = reader.leaves();
38+
int[][] result = ReaderUtil.partitionByLeaf(new ScoreDoc[0], leaves);
39+
assertEquals(leaves.size(), result.length);
40+
for (int[] leaf : result) {
41+
assertEquals(0, leaf.length);
42+
}
43+
}
44+
}
45+
}
46+
47+
public void testPartitionByLeafSingleSegment() throws IOException {
48+
try (Directory dir = newDirectory();
49+
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
50+
for (int i = 0; i < 10; i++) {
51+
writer.addDocument(new Document());
52+
}
53+
try (DirectoryReader reader = DirectoryReader.open(writer)) {
54+
List<LeafReaderContext> leaves = reader.leaves();
55+
assertEquals(1, leaves.size());
56+
57+
ScoreDoc[] hits = {
58+
new ScoreDoc(0, 1f), new ScoreDoc(3, 1f), new ScoreDoc(5, 1f), new ScoreDoc(9, 1f)
59+
};
60+
int[][] result = ReaderUtil.partitionByLeaf(hits, leaves);
61+
62+
assertEquals(1, result.length);
63+
assertArrayEquals(new int[] {0, 3, 5, 9}, result[0]);
64+
}
65+
}
66+
}
67+
68+
public void testPartitionByLeafMultipleSegments() throws IOException {
69+
try (Directory dir = newDirectory();
70+
IndexWriter writer =
71+
new IndexWriter(dir, new IndexWriterConfig().setMergePolicy(NoMergePolicy.INSTANCE))) {
72+
for (int i = 0; i < 10; i++) {
73+
writer.addDocument(new Document());
74+
}
75+
writer.commit();
76+
77+
// Create second segment
78+
for (int i = 0; i < 10; i++) {
79+
writer.addDocument(new Document());
80+
}
81+
writer.commit();
82+
83+
try (DirectoryReader reader = DirectoryReader.open(writer)) {
84+
List<LeafReaderContext> leaves = reader.leaves();
85+
assertEquals(2, leaves.size());
86+
87+
// Hits in both segments
88+
ScoreDoc[] hits = {
89+
new ScoreDoc(2, 1f), new ScoreDoc(9, 1f), new ScoreDoc(10, 1f), new ScoreDoc(18, 1f)
90+
};
91+
int[][] result = ReaderUtil.partitionByLeaf(hits, leaves);
92+
93+
assertEquals(2, result.length);
94+
// First segment: docs 0-9
95+
assertArrayEquals(new int[] {2, 9}, result[0]);
96+
// Second segment: docs 10-19
97+
assertArrayEquals(new int[] {10, 18}, result[1]);
98+
}
99+
}
100+
}
101+
102+
public void testPartitionByLeafSkipsSegmentsWithNoHits() throws IOException {
103+
try (Directory dir = newDirectory();
104+
IndexWriter writer =
105+
new IndexWriter(dir, new IndexWriterConfig().setMergePolicy(NoMergePolicy.INSTANCE))) {
106+
// Create 3 segments
107+
for (int seg = 0; seg < 3; seg++) {
108+
for (int i = 0; i < 10; i++) {
109+
writer.addDocument(new Document());
110+
}
111+
writer.commit();
112+
}
113+
114+
try (DirectoryReader reader = DirectoryReader.open(writer)) {
115+
List<LeafReaderContext> leaves = reader.leaves();
116+
assertEquals(3, leaves.size());
117+
118+
// Hits only in first and third segment (skip middle)
119+
ScoreDoc[] hits = {new ScoreDoc(3, 1f), new ScoreDoc(25, 1f)};
120+
int[][] result = ReaderUtil.partitionByLeaf(hits, leaves);
121+
122+
assertEquals(3, result.length);
123+
assertArrayEquals(new int[] {3}, result[0]);
124+
assertEquals(0, result[1].length); // middle segment has no hits
125+
assertArrayEquals(new int[] {25}, result[2]);
126+
}
127+
}
128+
}
129+
130+
public void testPartitionByLeafRandomized() throws IOException {
131+
for (int iter = 0; iter < 100; iter++) {
132+
int numSegments = random().nextInt(10) + 1;
133+
int totalDocs = 0;
134+
int[] docsPerSegment = new int[numSegments];
135+
for (int i = 0; i < numSegments; i++) {
136+
docsPerSegment[i] = random().nextInt(100) + 1;
137+
totalDocs += docsPerSegment[i];
138+
}
139+
140+
try (Directory dir = newDirectory();
141+
IndexWriter writer =
142+
new IndexWriter(
143+
dir, new IndexWriterConfig().setMergePolicy(NoMergePolicy.INSTANCE))) {
144+
for (int seg = 0; seg < numSegments; seg++) {
145+
for (int i = 0; i < docsPerSegment[seg]; i++) {
146+
writer.addDocument(new Document());
147+
}
148+
writer.commit();
149+
}
150+
151+
try (DirectoryReader reader = DirectoryReader.open(writer)) {
152+
List<LeafReaderContext> leaves = reader.leaves();
153+
assertEquals(numSegments, leaves.size());
154+
155+
// Generate random hits (0 to totalDocs inclusive - covers empty and all-match)
156+
int numHits = random().nextInt(totalDocs + 1);
157+
Set<Integer> hitSet = new HashSet<>();
158+
while (hitSet.size() < numHits) {
159+
hitSet.add(random().nextInt(totalDocs));
160+
}
161+
int[] docIds = hitSet.stream().mapToInt(Integer::intValue).toArray();
162+
ScoreDoc[] hits = new ScoreDoc[docIds.length];
163+
for (int i = 0; i < docIds.length; i++) {
164+
hits[i] = new ScoreDoc(docIds[i], 1f);
165+
}
166+
167+
int[][] result = ReaderUtil.partitionByLeaf(hits, leaves);
168+
169+
// Verify: result length matches leaves
170+
assertEquals(numSegments, result.length);
171+
172+
// Verify: total hits preserved
173+
int totalResultDocs = Arrays.stream(result).mapToInt(a -> a.length).sum();
174+
assertEquals(docIds.length, totalResultDocs);
175+
176+
// Verify: each doc in correct leaf and sorted
177+
for (int leafIdx = 0; leafIdx < result.length; leafIdx++) {
178+
int[] leafDocs = result[leafIdx];
179+
LeafReaderContext leaf = leaves.get(leafIdx);
180+
int docBase = leaf.docBase;
181+
int maxDoc = leaf.reader().maxDoc();
182+
for (int i = 0; i < leafDocs.length; i++) {
183+
int docId = leafDocs[i];
184+
assertTrue(docId >= docBase && docId < docBase + maxDoc);
185+
if (i > 0) {
186+
assertTrue(leafDocs[i] > leafDocs[i - 1]);
187+
}
188+
}
189+
}
190+
}
191+
}
192+
}
193+
}
194+
}

0 commit comments

Comments
 (0)