Add ReaderUtil#partitionByLeaf to partition global docIDs by leaf reader (#15803)

zihanx · web-flow · commit 1f0bc367b7f2 · 2026-03-24T15:15:34.000-07:00
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -223,6 +223,9 @@ API Changes
 
 * GITHUB#15751: Prevent zero vectors in knn fields with cosine similarity configured (Vigya Sharma)
 
+* GITHUB#15803: Add ReaderUtil#partitionByLeaf to partition doc IDs from
+  ScoreDoc hits by leaf reader. (Zihan Xu)
+
 New Features
 ---------------------
 
diff --git a/lucene/core/src/java/org/apache/lucene/index/ReaderUtil.java b/lucene/core/src/java/org/apache/lucene/index/ReaderUtil.java
@@ -16,7 +16,9 @@
  */
 package org.apache.lucene.index;
 
+import java.util.Arrays;
 import java.util.List;
+import org.apache.lucene.search.ScoreDoc;
 
 /**
  * Common util methods for dealing with {@link IndexReader}s and {@link IndexReaderContext}s.
@@ -25,6 +27,8 @@
  */
 public final class ReaderUtil {
 
+  private static final int[] EMPTY_INT_ARRAY = new int[0]; // shared by all leaves without hits
+
   private ReaderUtil() {} // no instance
 
   /**
@@ -89,4 +93,54 @@ public static int subIndex(int n, List<LeafReaderContext> leaves) {
     }
     return hi;
   }
+
+  /**
+   * Groups the global doc IDs of the given hits by the leaf that contains them. The doc IDs are
+   * copied out of {@code hits} and sorted, so the {@code hits} array itself is not reordered and
+   * every returned group is in ascending order. Duplicate doc IDs are kept.
+   *
+   * <p>Doc IDs are not validated here; each one is expected to belong to one of {@code leaves}.
+   *
+   * @param hits the ScoreDoc array (typically from TopDocs.scoreDocs)
+   * @param leaves the index reader's leaves
+   * @return array indexed by leaf ord, holding the global doc IDs that fall into that leaf in
+   *     ascending order (an empty array for a leaf without hits)
+   */
+  public static int[][] partitionByLeaf(ScoreDoc[] hits, List<LeafReaderContext> leaves) {
+    final int[][] perLeaf = new int[leaves.size()][];
+    if (hits.length == 0) {
+      Arrays.fill(perLeaf, EMPTY_INT_ARRAY);
+      return perLeaf;
+    }
+
+    final int[] docs = new int[hits.length];
+    int filled = 0;
+    for (ScoreDoc hit : hits) {
+      docs[filled++] = hit.doc;
+    }
+    Arrays.sort(docs);
+
+    // docs[from, pos) is the run of doc IDs gathered so far for the leaf at ordinal ord
+    int ord = 0;
+    int from = 0;
+    LeafReaderContext ctx = leaves.getFirst();
+    int upper = ctx.docBase + ctx.reader().maxDoc(); // exclusive end of the current leaf
+    for (int pos = 0; pos < docs.length; pos++) {
+      // Close the current leaf, and any hit-less leaves after it, until docs[pos] fits
+      while (docs[pos] >= upper) {
+        perLeaf[ord] = pos == from ? EMPTY_INT_ARRAY : Arrays.copyOfRange(docs, from, pos);
+        from = pos;
+        ctx = leaves.get(++ord);
+        upper = ctx.docBase + ctx.reader().maxDoc();
+      }
+    }
+
+    // The last leaf reached owns everything that is left, which is at least one doc ID
+    assert from < docs.length;
+    perLeaf[ord] = Arrays.copyOfRange(docs, from, docs.length);
+
+    // Leaves past the last hit have nothing
+    Arrays.fill(perLeaf, ord + 1, perLeaf.length, EMPTY_INT_ARRAY);
+    return perLeaf;
+  }
 }
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestReaderUtil.java b/lucene/core/src/test/org/apache/lucene/index/TestReaderUtil.java
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.index;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.tests.util.LuceneTestCase;
+
+public class TestReaderUtil extends LuceneTestCase {
+
+  /** Score given to every synthetic hit; the tests only make assertions about doc IDs. */
+  private static final float SCORE = 1f;
+
+  /** Wraps each global doc ID in a {@link ScoreDoc}, keeping the order of the arguments. */
+  private static ScoreDoc[] hitsFor(int... docIds) {
+    ScoreDoc[] hits = new ScoreDoc[docIds.length];
+    for (int i = 0; i < docIds.length; i++) {
+      hits[i] = new ScoreDoc(docIds[i], SCORE);
+    }
+    return hits;
+  }
+
+  /** Writer config with merging disabled, for tests that assert an exact leaf count. */
+  private static IndexWriterConfig noMergeConfig() {
+    return new IndexWriterConfig().setMergePolicy(NoMergePolicy.INSTANCE);
+  }
+
+  /** Indexes {@code numDocs} empty documents without committing. */
+  private static void addEmptyDocs(IndexWriter writer, int numDocs) throws IOException {
+    for (int i = 0; i < numDocs; i++) {
+      writer.addDocument(new Document());
+    }
+  }
+
+  /** Indexes {@code numDocs} empty documents and commits them. */
+  private static void addSegment(IndexWriter writer, int numDocs) throws IOException {
+    addEmptyDocs(writer, numDocs);
+    writer.commit();
+  }
+
+  /** With no hits, every leaf gets an empty partition. */
+  public void testPartitionByLeafEmptyDocIds() throws IOException {
+    try (Directory dir = newDirectory();
+        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
+      addEmptyDocs(writer, 2);
+      try (DirectoryReader reader = DirectoryReader.open(writer)) {
+        List<LeafReaderContext> leaves = reader.leaves();
+        int[][] partitions = ReaderUtil.partitionByLeaf(new ScoreDoc[0], leaves);
+        assertEquals(leaves.size(), partitions.length);
+        for (int[] partition : partitions) {
+          assertEquals(0, partition.length);
+        }
+      }
+    }
+  }
+
+  /** All hits on a one-leaf reader land in the only partition. */
+  public void testPartitionByLeafSingleSegment() throws IOException {
+    try (Directory dir = newDirectory();
+        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
+      addEmptyDocs(writer, 10);
+      try (DirectoryReader reader = DirectoryReader.open(writer)) {
+        List<LeafReaderContext> leaves = reader.leaves();
+        assertEquals(1, leaves.size());
+
+        int[][] partitions = ReaderUtil.partitionByLeaf(hitsFor(0, 3, 5, 9), leaves);
+
+        assertEquals(1, partitions.length);
+        assertArrayEquals(new int[] {0, 3, 5, 9}, partitions[0]);
+      }
+    }
+  }
+
+  /** Hits are split at the boundary between two 10-document segments. */
+  public void testPartitionByLeafMultipleSegments() throws IOException {
+    try (Directory dir = newDirectory();
+        IndexWriter writer = new IndexWriter(dir, noMergeConfig())) {
+      addSegment(writer, 10);
+      addSegment(writer, 10);
+
+      try (DirectoryReader reader = DirectoryReader.open(writer)) {
+        List<LeafReaderContext> leaves = reader.leaves();
+        assertEquals(2, leaves.size());
+
+        int[][] partitions = ReaderUtil.partitionByLeaf(hitsFor(2, 9, 10, 18), leaves);
+
+        assertEquals(2, partitions.length);
+        assertArrayEquals(new int[] {2, 9}, partitions[0]); // first segment: docs 0-9
+        assertArrayEquals(new int[] {10, 18}, partitions[1]); // second segment: docs 10-19
+      }
+    }
+  }
+
+  /** A leaf without hits, sitting between two leaves with hits, gets an empty partition. */
+  public void testPartitionByLeafSkipsSegmentsWithNoHits() throws IOException {
+    try (Directory dir = newDirectory();
+        IndexWriter writer = new IndexWriter(dir, noMergeConfig())) {
+      for (int seg = 0; seg < 3; seg++) {
+        addSegment(writer, 10);
+      }
+
+      try (DirectoryReader reader = DirectoryReader.open(writer)) {
+        List<LeafReaderContext> leaves = reader.leaves();
+        assertEquals(3, leaves.size());
+
+        // Hits only in the first and third segments
+        int[][] partitions = ReaderUtil.partitionByLeaf(hitsFor(3, 25), leaves);
+
+        assertEquals(3, partitions.length);
+        assertArrayEquals(new int[] {3}, partitions[0]);
+        assertEquals(0, partitions[1].length); // middle segment has no hits
+        assertArrayEquals(new int[] {25}, partitions[2]);
+      }
+    }
+  }
+
+  /**
+   * Partitions random sets of distinct doc IDs over random segment layouts and checks the leaf
+   * count, the total number of doc IDs, and that each partition is in range and strictly
+   * increasing.
+   */
+  public void testPartitionByLeafRandomized() throws IOException {
+    for (int iter = 0; iter < 100; iter++) {
+      int numSegments = random().nextInt(10) + 1;
+      int[] docsPerSegment = new int[numSegments];
+      int totalDocs = 0;
+      for (int seg = 0; seg < numSegments; seg++) {
+        docsPerSegment[seg] = random().nextInt(100) + 1;
+        totalDocs += docsPerSegment[seg];
+      }
+
+      try (Directory dir = newDirectory();
+          IndexWriter writer = new IndexWriter(dir, noMergeConfig())) {
+        for (int numDocs : docsPerSegment) {
+          addSegment(writer, numDocs);
+        }
+
+        try (DirectoryReader reader = DirectoryReader.open(writer)) {
+          List<LeafReaderContext> leaves = reader.leaves();
+          assertEquals(numSegments, leaves.size());
+
+          // Distinct hits, 0 to totalDocs inclusive: covers the empty and the all-match cases
+          int numHits = random().nextInt(totalDocs + 1);
+          Set<Integer> distinctDocs = new HashSet<>();
+          while (distinctDocs.size() < numHits) {
+            distinctDocs.add(random().nextInt(totalDocs));
+          }
+          int[] docIds = distinctDocs.stream().mapToInt(Integer::intValue).toArray();
+
+          int[][] partitions = ReaderUtil.partitionByLeaf(hitsFor(docIds), leaves);
+
+          // One partition per leaf
+          assertEquals(numSegments, partitions.length);
+
+          // The total number of doc IDs is preserved
+          int partitionedDocs = Arrays.stream(partitions).mapToInt(p -> p.length).sum();
+          assertEquals(docIds.length, partitionedDocs);
+
+          // Every doc ID is within its leaf's range, in strictly increasing order
+          for (int ord = 0; ord < partitions.length; ord++) {
+            int[] partition = partitions[ord];
+            LeafReaderContext leaf = leaves.get(ord);
+            int minDoc = leaf.docBase;
+            int endDoc = minDoc + leaf.reader().maxDoc();
+            for (int i = 0; i < partition.length; i++) {
+              assertTrue(partition[i] >= minDoc && partition[i] < endDoc);
+              assertTrue(i == 0 || partition[i] > partition[i - 1]);
+            }
+          }
+        }
+      }
+    }
+  }
+}