Skip to content

Commit 86ba05a

Browse files
msokolov and uschindler authored
Improve TruncateTokenFilter to truncate on codepoints (and no longer produce half surrogates) or legacy utf-16 chars (#15900)
Fix TruncateTokenFilter to count and trim on codepoints not chars. This fixes #15899 Co-authored-by: Uwe Schindler <[email protected]>
1 parent e6d3b52 commit 86ba05a

6 files changed

Lines changed: 317 additions & 61 deletions

File tree

lucene/CHANGES.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,10 @@ Bug Fixes
329329
RefCountedSharedArena.DEFAULT_MAX_PERMITS instead of a hardcoded value, so that
330330
the default change from GITHUB#15078 takes effect. (Huaixinww)
331331

332+
* GITHUB#15899, GITHUB#15900: Improve TruncateTokenFilter to truncate on codepoints not
333+
chars and no longer produce half surrogates. There are new factory parameters available
334+
to configure legacy prefix chars and new codepoint behaviour. (Uwe Schindler, Michael Sokolov)
335+
332336
Other
333337
---------------------
334338
* GITHUB#15586: Document that scoring and ranking may change across major Lucene versions, and that applications requiring stable ranking should explicitly configure Similarity. (Parveen Saini)

lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.java

Lines changed: 74 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,41 +16,101 @@
1616
*/
1717
package org.apache.lucene.analysis.miscellaneous;
1818

19+
import static java.lang.Character.isHighSurrogate;
20+
import static java.lang.Character.isLowSurrogate;
21+
1922
import java.io.IOException;
2023
import org.apache.lucene.analysis.TokenFilter;
2124
import org.apache.lucene.analysis.TokenStream;
2225
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
2326
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
2427

2528
/**
26-
* A token filter for truncating the terms into a specific length. Fixed prefix truncation, as a
27-
* stemming method, produces good results on Turkish language. It is reported that F5, using first 5
28-
* characters, produced best results in <a
29-
* href="http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf">Information Retrieval on
30-
* Turkish Texts</a>
29+
* A token filter for truncating the terms into a specific length (number of codepoints). Fixed
30+
* prefix truncation, as a stemming method, produces good results on Turkish language. It is
31+
* reported that F5, using first 5 characters, produced best results in <a
32+
* href="https://doi.org/10.1002/asi.20750">Information Retrieval on Turkish Texts</a>
33+
*
34+
* <p>Since Lucene 10.5, the filter is able to correctly handle codepoints and truncates after the
35+
* given number of codepoints, no longer producing incomplete surrogate pairs. Use the modern
36+
* factory method {@link #truncateAfterCodePoints(TokenStream, int)} to enable this mode. Legacy
37+
* behaviour is still available with {@link #truncateAfterChars(TokenStream, int)}
3138
*/
3239
public final class TruncateTokenFilter extends TokenFilter {
3340

3441
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
3542
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
3643

37-
private final int length;
44+
private final int truncateAfter;
45+
private final boolean useCodePoints;
46+
47+
/** Returns a filter with a prefix of {@code nCodePoints}. */
48+
public static TruncateTokenFilter truncateAfterCodePoints(TokenStream input, int nCodePoints) {
49+
return new TruncateTokenFilter(input, nCodePoints, true);
50+
}
51+
52+
/**
53+
* Returns a filter with a prefix of {@code nChars} Java Characters. This may split surrogate
54+
* pairs.
55+
*/
56+
public static TruncateTokenFilter truncateAfterChars(TokenStream input, int nChars) {
57+
return new TruncateTokenFilter(input, nChars, false);
58+
}
59+
60+
/**
61+
* Instantiates filter with a prefix of {@code nChars} Java Characters. This may split surrogate
62+
* pairs.
63+
*
64+
* @deprecated This constructor is deprecated, use {@link #truncateAfterChars(TokenStream, int)}
65+
* for backwards compatibility, or {@link #truncateAfterCodePoints(TokenStream, int)} to be
66+
* unicode conformant.
67+
*/
68+
@Deprecated
69+
public TruncateTokenFilter(TokenStream input, int nChars) {
70+
this(input, nChars, false);
71+
}
3872

39-
public TruncateTokenFilter(TokenStream input, int length) {
73+
private TruncateTokenFilter(TokenStream input, int truncateAfter, boolean useCodePoints) {
4074
super(input);
41-
if (length < 1)
42-
throw new IllegalArgumentException("length parameter must be a positive number: " + length);
43-
this.length = length;
75+
if (truncateAfter < 1) {
76+
throw new IllegalArgumentException(
77+
"truncateAfter parameter must be a positive number: " + truncateAfter);
78+
}
79+
this.truncateAfter = truncateAfter;
80+
this.useCodePoints = useCodePoints;
4481
}
4582

4683
@Override
4784
public final boolean incrementToken() throws IOException {
48-
if (input.incrementToken()) {
49-
if (!keywordAttr.isKeyword() && termAttribute.length() > length)
50-
termAttribute.setLength(length);
85+
if (!input.incrementToken()) {
86+
return false;
87+
}
88+
if (keywordAttr.isKeyword()) {
5189
return true;
90+
}
91+
final int len = termAttribute.length();
92+
if (len <= truncateAfter) {
93+
// the term is short enough, so we do not need to modify it
94+
// (works for both chars and codepoints)
95+
return true;
96+
}
97+
if (useCodePoints) {
98+
// code based on ICU4J's com.ibm.icu.text.UTF16#findOffsetFromCodePoint(...) implementation:
99+
final char[] arr = termAttribute.buffer();
100+
int ofs = 0, remaining = truncateAfter;
101+
while (ofs < len && remaining > 0) {
102+
if (isHighSurrogate(arr[ofs++]) && ofs < len && isLowSurrogate(arr[ofs])) {
103+
ofs++;
104+
}
105+
remaining--;
106+
}
107+
// check if we actually reached the limit and set new length based on calculated offset:
108+
if (remaining == 0) {
109+
termAttribute.setLength(ofs);
110+
}
52111
} else {
53-
return false;
112+
termAttribute.setLength(truncateAfter);
54113
}
114+
return true;
55115
}
56116
}

lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilterFactory.java

Lines changed: 46 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,26 @@
1717
package org.apache.lucene.analysis.miscellaneous;
1818

1919
import java.util.Map;
20+
import java.util.function.BiFunction;
2021
import org.apache.lucene.analysis.TokenFilterFactory;
2122
import org.apache.lucene.analysis.TokenStream;
23+
import org.apache.lucene.util.Version;
2224

2325
/**
24-
* Factory for {@link org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter}. The following
25-
* type is recommended for "<i>diacritics-insensitive search</i>" for Turkish.
26+
* Factory for {@link org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter}.
27+
*
28+
* <p>Fixed prefix truncation, as a stemming method, produces good results on Turkish language. It
29+
* is reported that F5, using first 5 characters, produced best results in <a
30+
* href="https://doi.org/10.1002/asi.20750">Information Retrieval on Turkish Texts</a>
31+
*
32+
* <p>Since Lucene 10.5, the filter correctly handles codepoints and truncates after {@code
33+
* truncateAfterCodePoints} codepoints, no longer producing incomplete surrogate pairs. For
34+
* backwards compatibility the old {@code prefixLength} is still supported and its behaviour depends
35+
* on the {@code luceneMatchVersion} parameter. If no parameter is given, it uses a prefix length of
36+
* 5. In case you change to the more modern codepoint behaviour, reindexing may be required if your
37+
* documents contain surrogate pairs (like emojis).
38+
*
39+
* <p>The following type is recommended for "<i>diacritics-insensitive search</i>" for Turkish:
2640
*
2741
* <pre><code class="language-xml">
2842
* &lt;fieldType name="text_tr_ascii_f5" class="solr.TextField" positionIncrementGap="100"&gt;
@@ -32,7 +46,7 @@
3246
* &lt;filter class="solr.TurkishLowerCaseFilterFactory"/&gt;
3347
* &lt;filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="true"/&gt;
3448
* &lt;filter class="solr.KeywordRepeatFilterFactory"/&gt;
35-
* &lt;filter class="solr.TruncateTokenFilterFactory" prefixLength="5"/&gt;
49+
* &lt;filter class="solr.TruncateTokenFilterFactory" truncateAfterCodePoints="5"/&gt;
3650
* &lt;filter class="solr.RemoveDuplicatesTokenFilterFactory"/&gt;
3751
* &lt;/analyzer&gt;
3852
* &lt;/fieldType&gt;</code></pre>
@@ -45,27 +59,50 @@ public class TruncateTokenFilterFactory extends TokenFilterFactory {
4559
/** SPI name */
4660
public static final String NAME = "truncate";
4761

48-
public static final String PREFIX_LENGTH_KEY = "prefixLength";
49-
private final int prefixLength;
62+
@Deprecated public static final String PREFIX_LENGTH_KEY = "prefixLength";
63+
public static final String TRUNCATE_AFTER_CODEPOINTS_KEY = "truncateAfterCodePoints";
64+
public static final String TRUNCATE_AFTER_CHARS_KEY = "truncateAfterChars";
65+
66+
private final int truncateAfter;
67+
private final BiFunction<TokenStream, Integer, TruncateTokenFilter> factory;
5068

5169
public TruncateTokenFilterFactory(Map<String, String> args) {
5270
super(args);
53-
prefixLength = Integer.parseInt(get(args, PREFIX_LENGTH_KEY, "5"));
54-
if (prefixLength < 1)
71+
Map<String, BiFunction<TokenStream, Integer, TruncateTokenFilter>> paramMapping =
72+
Map.of(
73+
TRUNCATE_AFTER_CODEPOINTS_KEY, TruncateTokenFilter::truncateAfterCodePoints,
74+
TRUNCATE_AFTER_CHARS_KEY, TruncateTokenFilter::truncateAfterChars,
75+
PREFIX_LENGTH_KEY, this::legacyPrefixLengthFactory);
76+
var avail = paramMapping.keySet().stream().filter(args::containsKey).toList();
77+
if (avail.size() > 1) {
5578
throw new IllegalArgumentException(
56-
PREFIX_LENGTH_KEY + " parameter must be a positive number: " + prefixLength);
79+
"Can only give one of the following parameters: " + paramMapping.keySet());
80+
}
81+
String param = avail.stream().findFirst().orElse(PREFIX_LENGTH_KEY);
82+
this.truncateAfter = getInt(args, param, 5);
83+
this.factory = paramMapping.get(param);
84+
if (truncateAfter < 1) {
85+
throw new IllegalArgumentException(
86+
param + " parameter must be a positive number: " + truncateAfter);
87+
}
5788
if (!args.isEmpty()) {
5889
throw new IllegalArgumentException("Unknown parameter(s): " + args);
5990
}
6091
}
6192

93+
private TruncateTokenFilter legacyPrefixLengthFactory(TokenStream input, int prefixChars) {
94+
return (luceneMatchVersion.onOrAfter(Version.LUCENE_10_5_0))
95+
? TruncateTokenFilter.truncateAfterCodePoints(input, prefixChars)
96+
: TruncateTokenFilter.truncateAfterChars(input, prefixChars);
97+
}
98+
6299
/** Default ctor for compatibility with SPI */
public TruncateTokenFilterFactory() {
  // SPI discovery requires a visible no-arg constructor to exist; it must never be invoked
  throw defaultCtorException();
}
66103

67104
@Override
68105
public TokenStream create(TokenStream input) {
69-
return new TruncateTokenFilter(input, prefixLength);
106+
return factory.apply(input, truncateAfter);
70107
}
71108
}

lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConditionalTokenFilter.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,8 @@ public void testMultipleConditionalFilters() throws IOException {
323323
new SkipMatchingFilter(
324324
stream,
325325
in -> {
326-
TruncateTokenFilter truncateFilter = new TruncateTokenFilter(in, 2);
326+
TruncateTokenFilter truncateFilter =
327+
TruncateTokenFilter.truncateAfterCodePoints(in, 2);
327328
return new AssertingLowerCaseFilter(truncateFilter);
328329
},
329330
".*o.*");

lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTruncateTokenFilter.java

Lines changed: 106 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,22 +16,123 @@
1616
*/
1717
package org.apache.lucene.analysis.miscellaneous;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.tests.analysis.MockTokenizer;
import org.apache.lucene.tests.util.TestUtil;
import org.junit.Test;

/** Test the truncate token filter. */
public class TestTruncateTokenFilter extends BaseTokenStreamTestCase {

  // Legacy char-counting mode: counts UTF-16 code units, so the last input token is cut in the
  // middle of the emoji's surrogate pair, leaving a lone high surrogate in the output.
  public void testLegacyTruncating() throws Exception {
    TokenStream stream =
        whitespaceMockTokenizer("abcdefg 1234567 ABCDEFG abcde abc 12345 123 1234567 1234😃5");
    stream = TruncateTokenFilter.truncateAfterChars(stream, 5);
    assertTokenStreamContents(
        stream,
        new String[] {
          "abcde",
          "12345",
          "ABCDE",
          "abcde",
          "abc",
          "12345",
          "123",
          "12345",
          "1234" + "😃".charAt(0) // half surrogate: the legacy mode splits the pair
        });
  }

  // Codepoint mode: each emoji (a surrogate pair) counts as one codepoint and is never split.
  public void testCodePointTruncating() throws Exception {
    TokenStream stream =
        whitespaceMockTokenizer(
            "abcdefg 1234567 ABCDEFG abcde abc 12345 123 1234😃5 1 😃 😃12345 😃😃 😃😃😃 😃😃😃😃 😃😃😃😃😃 😃😃😃😃😃😃");
    stream = TruncateTokenFilter.truncateAfterCodePoints(stream, 5);
    assertTokenStreamContents(
        stream,
        new String[] {
          "abcde",
          "12345",
          "ABCDE",
          "abcde",
          "abc",
          "12345",
          "123",
          "1234😃",
          "1",
          "😃",
          "😃1234",
          "😃😃",
          "😃😃😃",
          "😃😃😃😃",
          "😃😃😃😃😃",
          "😃😃😃😃😃" // six emojis truncated to five complete codepoints
        });
  }

  // Cross-checks the truncated stream against an untruncated stream over random analysis strings:
  // short terms pass through unchanged; long terms become exact truncateLength-codepoint prefixes.
  public void testRandom() throws Exception {
    var rnd = random();
    for (int i = 0; i < 50 * RANDOM_MULTIPLIER; i++) {
      var truncateLength = rnd.nextInt(5) + 1;
      String text = TestUtil.randomAnalysisString(rnd, 200, false);

      TokenStream ts1 = whitespaceMockTokenizer(text);
      CharTermAttribute termAtt1 = ts1.addAttribute(CharTermAttribute.class);
      TokenStream ts2 =
          TruncateTokenFilter.truncateAfterCodePoints(
              whitespaceMockTokenizer(text), truncateLength);
      CharTermAttribute termAtt2 = ts2.addAttribute(CharTermAttribute.class);

      ts1.reset();
      ts2.reset();
      while (ts2.incrementToken()) {
        assertTrue(ts1.incrementToken());
        // compare codepoint counts, not char counts, since truncation is codepoint-based
        int len1 = Character.codePointCount(termAtt1, 0, termAtt1.length());
        int len2 = Character.codePointCount(termAtt2, 0, termAtt2.length());
        if (len1 <= truncateLength) {
          // term already within the limit: must be passed through unchanged
          assertEquals(len1, len2);
          assertEquals(termAtt1.toString(), termAtt2.toString());
        } else {
          // term over the limit: must be a prefix of exactly truncateLength codepoints
          assertEquals(truncateLength, len2);
          assertTrue(termAtt1.toString().startsWith(termAtt2.toString()));
        }
      }
      assertFalse(ts1.incrementToken());
      ts1.end();
      ts2.end();
      ts1.close();
      ts2.close();
    }
  }

  // Runs the standard random-data stress harness against the codepoint-truncating filter.
  public void testStressRandom() throws Exception {
    var rnd = random();
    var truncateLength = rnd.nextInt(5) + 1;
    Analyzer a =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(
                tokenizer, TruncateTokenFilter.truncateAfterCodePoints(tokenizer, truncateLength));
          }
        };
    checkRandomData(rnd, a, 20 * RANDOM_MULTIPLIER, truncateLength * 2);
  }

  // Non-positive lengths must be rejected by the legacy char-based factory method.
  @Test(expected = IllegalArgumentException.class)
  public void testLegacyNonPositiveLength() throws Exception {
    TruncateTokenFilter.truncateAfterChars(
        whitespaceMockTokenizer("param must be a positive number"), -48);
  }

  // Non-positive lengths must be rejected by the codepoint-based factory method as well.
  @Test(expected = IllegalArgumentException.class)
  public void testNonPositiveLength() throws Exception {
    TruncateTokenFilter.truncateAfterCodePoints(
        whitespaceMockTokenizer("param must be a positive number"), -48);
  }
}

0 commit comments

Comments
 (0)