Commit 5f3d045

xiangfu0 and claude committed
Add codec pipeline framework for raw forward index encoding (V7)
Introduces a self-describing codec DSL (DELTA, DELTADELTA, ZSTD(N), LZ4, SNAPPY, GZIP, CODEC(...)) and a new on-disk format version 7 that stores the canonical codec spec in the file header. The pipeline is codec-agnostic: each codec implements the ChunkCodecHandler SPI (encode/decode/decodeInto/maxEncodedSize) and is dispatched polymorphically by CodecPipelineExecutor.

Key components:
- Codec DSL AST + parser (CodecSpecParser) in pinot-segment-spi
- ChunkCodecHandler SPI and CodecRegistry/Validator/Executor in pinot-segment-local
- V7 fixed-byte chunk writer (FixedByteChunkForwardIndexWriterV7) and reader (FixedByteChunkSVForwardIndexReaderV7) for INT/LONG SV columns
- ForwardIndexCreatorFactory + ForwardIndexReaderFactory dispatch for V7
- ForwardIndexConfig.codecSpec field (mutually exclusive with compressionCodec); forces writer version 7
- ForwardIndexHandler V7-to-legacy rollback detection on segment reload
- CompressionCodecMigrator: type-agnostic and schema-aware helpers to translate legacy compressionCodec to codecSpec
- TableConfigUtils.validateCodecSpecIfPresent for table-level validation
- FieldConfig.withCodecSpec builder method
- CodecPipelineIntegrationTest: end-to-end segment build + query test covering INT/LONG codec-pipeline columns and STRING dictionary column

Co-Authored-By: Claude Sonnet 4.6 <[email protected]>
1 parent 3eff094 commit 5f3d045

40 files changed

Lines changed: 5477 additions & 16 deletions
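
For orientation before the diff: the commit message above names the four ChunkCodecHandler methods and the CodecPipelineExecutor dispatcher, so here is a minimal sketch of how they could fit together. The method and class names come from the message itself; the parameter and return types, and the PipelineSketch helper, are assumptions for illustration, not the actual pinot-segment-local signatures.

import java.nio.ByteBuffer;
import java.util.List;

/** Hedged sketch of the SPI named in the commit message; real signatures may differ. */
interface ChunkCodecHandler {
  /** Encodes the readable bytes of {@code input} into {@code output}. */
  void encode(ByteBuffer input, ByteBuffer output);

  /** Decodes {@code input} into a newly allocated buffer. */
  ByteBuffer decode(ByteBuffer input);

  /** Decodes {@code input} into a caller-supplied buffer, avoiding an allocation. */
  void decodeInto(ByteBuffer input, ByteBuffer output);

  /** Worst-case encoded size for {@code rawSize} input bytes, used to size chunk buffers. */
  int maxEncodedSize(int rawSize);
}

/** How a pipeline executor could chain stages: encode front-to-back, decode back-to-front. */
final class PipelineSketch {
  private PipelineSketch() {
  }

  static ByteBuffer encode(List<ChunkCodecHandler> stages, ByteBuffer raw) {
    ByteBuffer current = raw;
    for (ChunkCodecHandler stage : stages) {
      // Size the output with the stage's worst-case bound, then encode and flip for reading.
      ByteBuffer out = ByteBuffer.allocate(stage.maxEncodedSize(current.remaining()));
      stage.encode(current, out);
      out.flip();
      current = out;
    }
    return current;
  }

  static ByteBuffer decode(List<ChunkCodecHandler> stages, ByteBuffer encoded) {
    // The last codec applied during encoding is the first to be undone.
    ByteBuffer current = encoded;
    for (int i = stages.size() - 1; i >= 0; i--) {
      current = stages.get(i).decode(current);
    }
    return current;
  }
}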

Lines changed: 270 additions & 0 deletions
@@ -0,0 +1,270 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.pinot.integration.tests.custom;

import com.fasterxml.jackson.databind.JsonNode;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import javax.annotation.Nullable;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.pinot.spi.config.table.FieldConfig;
import org.apache.pinot.spi.config.table.TableConfig;
import org.apache.pinot.spi.config.table.TableType;
import org.apache.pinot.spi.data.FieldSpec;
import org.apache.pinot.spi.data.Schema;
import org.apache.pinot.spi.utils.builder.TableConfigBuilder;
import org.testng.annotations.Test;

import static org.testng.Assert.assertEquals;


/**
 * Integration test for the codec pipeline forward index (version 7).
 * Writes an offline table with INT and LONG raw columns encoded via the CODEC(DELTA,ZSTD(3)) pipeline,
 * plus a STRING column stored with dictionary encoding, verifying both encoding paths coexist in
 * the same segment. Exercises the full segment build → query path.
 */
@Test(suiteName = "CustomClusterIntegrationTest")
public class CodecPipelineIntegrationTest extends CustomDataQueryClusterIntegrationTest {

  private static final String TABLE_NAME = "CodecPipelineIntegrationTest";
  private static final int NUM_DOCS = 1000;

  private static final String INT_COL = "intVal";
  private static final String LONG_COL = "longVal";
  private static final String STR_COL = "strVal";
  private static final String TIME_COL = "ts";

  // Predictable values: intVal[i] = i, longVal[i] = i * 1_000_000_000L
  // Sum(0..999) = 499_500
  private static final long EXPECTED_INT_SUM = 499_500L;
  // Sum(0..999) * 1_000_000_000 = 499_500 * 1_000_000_000 = 499_500_000_000_000L
  private static final long EXPECTED_LONG_SUM = 499_500L * 1_000_000_000L;

  @Override
  public String getTableName() {
    return TABLE_NAME;
  }

  @Override
  public Schema createSchema() {
    return new Schema.SchemaBuilder().setSchemaName(getTableName())
        .addMetric(INT_COL, FieldSpec.DataType.INT)
        .addMetric(LONG_COL, FieldSpec.DataType.LONG)
        .addSingleValueDimension(STR_COL, FieldSpec.DataType.STRING)
        .addDateTimeField(TIME_COL, FieldSpec.DataType.LONG, "1:MILLISECONDS:EPOCH", "1:MILLISECONDS")
        .build();
  }

  @Override
  public List<File> createAvroFiles()
      throws IOException {
    org.apache.avro.Schema avroSchema = org.apache.avro.Schema.createRecord("codecRecord", null, null, false);
    avroSchema.setFields(List.of(
        new org.apache.avro.Schema.Field(INT_COL, org.apache.avro.Schema.create(org.apache.avro.Schema.Type.INT),
            null, null),
        new org.apache.avro.Schema.Field(LONG_COL, org.apache.avro.Schema.create(org.apache.avro.Schema.Type.LONG),
            null, null),
        new org.apache.avro.Schema.Field(STR_COL, org.apache.avro.Schema.create(org.apache.avro.Schema.Type.STRING),
            null, null),
        new org.apache.avro.Schema.Field(TIME_COL, org.apache.avro.Schema.create(org.apache.avro.Schema.Type.LONG),
            null, null)));

    try (AvroFilesAndWriters avroFilesAndWriters = createAvroFilesAndWriters(avroSchema)) {
      List<DataFileWriter<GenericData.Record>> writers = avroFilesAndWriters.getWriters();
      for (int i = 0; i < NUM_DOCS; i++) {
        GenericData.Record record = new GenericData.Record(avroSchema);
        record.put(INT_COL, i);
        record.put(LONG_COL, (long) i * 1_000_000_000L);
        record.put(STR_COL, "str_" + i);
        record.put(TIME_COL, (long) i);
        writers.get(i % getNumAvroFiles()).append(record);
      }
      return avroFilesAndWriters.getAvroFiles();
    }
  }

  @Override
  public String getTimeColumnName() {
    return TIME_COL;
  }

  @Override
  protected long getCountStarResult() {
    return NUM_DOCS;
  }

  @Override
  public TableConfig createOfflineTableConfig() {
    return new TableConfigBuilder(TableType.OFFLINE).setTableName(getTableName())
        .setNoDictionaryColumns(getNoDictionaryColumns())
        .setFieldConfigList(getFieldConfigs())
        .build();
  }

  @Override
  protected List<String> getNoDictionaryColumns() {
    // STR_COL uses a dictionary (default), so it is intentionally NOT in this list
    return List.of(INT_COL, LONG_COL);
  }

  @Override
  protected List<FieldConfig> getFieldConfigs() {
    List<FieldConfig> fieldConfigs = new ArrayList<>();
    // INT column with full DELTA+ZSTD pipeline (codec pipeline, raw, no dict)
    fieldConfigs.add(new FieldConfig.Builder(INT_COL)
        .withEncodingType(FieldConfig.EncodingType.RAW)
        .withCodecSpec("CODEC(DELTA,ZSTD(3))")
        .build());
    // LONG column with ZSTD-only (codec pipeline, raw, no dict)
    fieldConfigs.add(new FieldConfig.Builder(LONG_COL)
        .withEncodingType(FieldConfig.EncodingType.RAW)
        .withCodecSpec("ZSTD(3)")
        .build());
    // STR_COL with dictionary encoding — verifies codec-pipeline and dict columns coexist in the same segment
    fieldConfigs.add(new FieldConfig.Builder(STR_COL)
        .withEncodingType(FieldConfig.EncodingType.DICTIONARY)
        .build());
    return fieldConfigs;
  }

  @Nullable
  @Override
  protected String getSortedColumn() {
    return null;
  }

  @Nullable
  @Override
  protected List<String> getInvertedIndexColumns() {
    return null;
  }

  @Nullable
  @Override
  protected List<String> getRangeIndexColumns() {
    return null;
  }

  @Nullable
  @Override
  protected List<String> getBloomFilterColumns() {
    return null;
  }

  @Test(dataProvider = "useBothQueryEngines")
  public void testSumQueries(boolean useMultiStageQueryEngine)
      throws Exception {
    setUseMultiStageQueryEngine(useMultiStageQueryEngine);

    // Verify COUNT(*)
    String countQuery = "SELECT COUNT(*) FROM " + getTableName();
    JsonNode countResult = postQuery(countQuery);
    assertEquals(countResult.get("resultTable").get("rows").get(0).get(0).asLong(), NUM_DOCS,
        "Unexpected row count");

    // Verify SUM(intVal) — exercises delta-decoded INT reads
    String intSumQuery = "SELECT SUM(intVal) FROM " + getTableName();
    JsonNode intSumResult = postQuery(intSumQuery);
    assertEquals(intSumResult.get("resultTable").get("rows").get(0).get(0).asLong(), EXPECTED_INT_SUM,
        "Unexpected SUM(intVal)");

    // Verify SUM(longVal) — exercises ZSTD-only LONG reads
    String longSumQuery = "SELECT SUM(longVal) FROM " + getTableName();
    JsonNode longSumResult = postQuery(longSumQuery);
    assertEquals(longSumResult.get("resultTable").get("rows").get(0).get(0).asLong(), EXPECTED_LONG_SUM,
        "Unexpected SUM(longVal)");
  }

  @Test(dataProvider = "useBothQueryEngines")
  public void testFilterQueries(boolean useMultiStageQueryEngine)
      throws Exception {
    setUseMultiStageQueryEngine(useMultiStageQueryEngine);

    // Filter on INT column — verifies individual value decoding is correct
    // intVal < 100 → 100 rows (values 0..99)
    String intFilterQuery = "SELECT COUNT(*) FROM " + getTableName() + " WHERE intVal < 100";
    JsonNode intFilterResult = postQuery(intFilterQuery);
    assertEquals(intFilterResult.get("resultTable").get("rows").get(0).get(0).asLong(), 100L,
        "Unexpected count for intVal < 100");

    // Filter on LONG column — verifies LONG decoding
    // longVal < 100_000_000_000L → rows with i < 100 → 100 rows
    String longFilterQuery =
        "SELECT COUNT(*) FROM " + getTableName() + " WHERE longVal < 100000000000";
    JsonNode longFilterResult = postQuery(longFilterQuery);
    assertEquals(longFilterResult.get("resultTable").get("rows").get(0).get(0).asLong(), 100L,
        "Unexpected count for longVal < 100_000_000_000");
  }

  /**
   * Verifies specific individual values via point lookups to detect per-doc decoding errors.
   * A codec that misapplies delta or reads stale chunks would produce wrong individual values
   * even if aggregate queries (SUM, COUNT) happen to be correct.
   */
  @Test(dataProvider = "useBothQueryEngines")
  public void testPointLookups(boolean useMultiStageQueryEngine)
      throws Exception {
    setUseMultiStageQueryEngine(useMultiStageQueryEngine);

    // Spot-check a few specific docs across different chunks
    int[] spotCheckIds = {0, 1, 511, 512, 513, 999};
    for (int id : spotCheckIds) {
      // intVal = id for row where ts = id (ts is unique and equals the doc index)
      String intQuery = "SELECT intVal FROM " + getTableName() + " WHERE ts = " + id;
      JsonNode intResult = postQuery(intQuery);
      assertEquals(intResult.get("resultTable").get("rows").get(0).get(0).asInt(), id,
          "Wrong intVal for ts=" + id);

      // longVal = id * 1_000_000_000L
      String longQuery = "SELECT longVal FROM " + getTableName() + " WHERE ts = " + id;
      JsonNode longResult = postQuery(longQuery);
      assertEquals(longResult.get("resultTable").get("rows").get(0).get(0).asLong(),
          (long) id * 1_000_000_000L, "Wrong longVal for ts=" + id);
    }
  }

  /**
   * Verifies that a STRING column stored with dictionary encoding (not codec pipeline) reads back
   * correctly alongside codec-pipeline columns, confirming both can coexist in the same segment.
   */
  @Test(dataProvider = "useBothQueryEngines")
  public void testStringColumnWithDictEncoding(boolean useMultiStageQueryEngine)
      throws Exception {
    setUseMultiStageQueryEngine(useMultiStageQueryEngine);

    // Spot-check a few string values: strVal = "str_<ts>"
    int[] spotCheckIds = {0, 42, 500, 999};
    for (int id : spotCheckIds) {
      String strQuery = "SELECT strVal FROM " + getTableName() + " WHERE ts = " + id;
      JsonNode result = postQuery(strQuery);
      assertEquals(result.get("resultTable").get("rows").get(0).get(0).asText(), "str_" + id,
          "Wrong strVal for ts=" + id);
    }

    // Verify COUNT DISTINCT to confirm all unique string values are stored
    String countDistinctQuery = "SELECT COUNT(DISTINCT strVal) FROM " + getTableName();
    JsonNode countDistinctResult = postQuery(countDistinctQuery);
    assertEquals(countDistinctResult.get("resultTable").get("rows").get(0).get(0).asLong(), NUM_DOCS,
        "Expected all " + NUM_DOCS + " distinct string values");
  }
}
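
The field configs above pair withCodecSpec with RAW encoding and omit the legacy compressionCodec. A hedged sketch of the table-level rule the commit message attributes to TableConfigUtils.validateCodecSpecIfPresent follows: getCodecSpec() is an assumed accessor matching the new builder method, and the RAW-encoding requirement is inferred from this test rather than stated in the commit message.

import org.apache.pinot.spi.config.table.FieldConfig;

/** Illustrative only: not the actual TableConfigUtils implementation from this commit. */
final class CodecSpecValidationSketch {
  private CodecSpecValidationSketch() {
  }

  static void validateCodecSpecIfPresent(FieldConfig fieldConfig) {
    String codecSpec = fieldConfig.getCodecSpec();  // assumed accessor added by this commit
    if (codecSpec == null) {
      return;
    }
    // Stated in the commit message: codecSpec and compressionCodec are mutually exclusive.
    if (fieldConfig.getCompressionCodec() != null) {
      throw new IllegalStateException(
          "Cannot set both codecSpec and compressionCodec for column: " + fieldConfig.getName());
    }
    // Inferred from the test above (not stated in the commit): codec-pipeline columns use RAW encoding.
    if (fieldConfig.getEncodingType() != FieldConfig.EncodingType.RAW) {
      throw new IllegalStateException(
          "codecSpec requires RAW encoding for column: " + fieldConfig.getName());
    }
  }
}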
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.pinot.segment.local.io.codec;

import java.nio.ByteBuffer;


/** Package-private buffer helpers shared across codec handler implementations. */
final class CodecBufferUtils {

  private CodecBufferUtils() {
  }

  /** Returns {@code buf} if already direct; otherwise copies into a new direct buffer. */
  static ByteBuffer toDirectBuffer(ByteBuffer buf) {
    if (buf.isDirect()) {
      return buf;
    }
    ByteBuffer direct = ByteBuffer.allocateDirect(buf.remaining());
    direct.put(buf.duplicate());
    direct.flip();
    return direct;
  }

  /** Copies the readable bytes of {@code buf} into a new heap byte array. */
  static byte[] toHeapArray(ByteBuffer buf) {
    byte[] bytes = new byte[buf.remaining()];
    buf.duplicate().get(bytes);
    return bytes;
  }
}
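
As a usage illustration, here is a short sketch of how a stream-based handler, for example the GZIP codec named in the commit message, might lean on toHeapArray, since java.util.zip operates on heap arrays rather than ByteBuffers. The class below is hypothetical (it is placed in the same package because CodecBufferUtils is package-private) and is not a handler from this commit.

package org.apache.pinot.segment.local.io.codec;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.zip.GZIPOutputStream;

/** Hypothetical encode path for a stream-based codec; illustrative only. */
final class GzipEncodeSketch {
  private GzipEncodeSketch() {
  }

  static ByteBuffer gzipEncode(ByteBuffer input) throws IOException {
    // Copy the readable bytes to the heap without disturbing the caller's buffer position.
    byte[] raw = CodecBufferUtils.toHeapArray(input);
    ByteArrayOutputStream out = new ByteArrayOutputStream(raw.length);
    try (GZIPOutputStream gzip = new GZIPOutputStream(out)) {
      gzip.write(raw);
    }
    return ByteBuffer.wrap(out.toByteArray());
  }
}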
