Skip to content

Commit 86ba05a

Browse files
msokolov and uschindler authored
Improve TruncateTokenFilter to truncate on codepoints (and no longer produce half surrogates) or legacy utf-16 chars (#15900)
Fix TruncateTokenFilter to count and trim on codepoints not chars. This fixes #15899 Co-authored-by: Uwe Schindler <[email protected]>
1 parent e6d3b52 commit 86ba05a

6 files changed

Lines changed: 317 additions & 61 deletions

File tree

lucene/CHANGES.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,10 @@ Bug Fixes
329329
RefCountedSharedArena.DEFAULT_MAX_PERMITS instead of a hardcoded value, so that
330330
the default change from GITHUB#15078 takes effect. (Huaixinww)
331331

332+
* GITHUB#15899, GITHUB#15900: Improve TruncateTokenFilter to truncate on codepoints not
333+
chars and no longer produce half surrogates. There are new factory parameters available
334+
to configure legacy prefix chars and new codepoint behaviour. (Uwe Schindler, Michael Sokolov)
335+
332336
Other
333337
---------------------
334338
* GITHUB#15586: Document that scoring and ranking may change across major Lucene versions, and that applications requiring stable ranking should explicitly configure Similarity. (Parveen Saini)

lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.java

Lines changed: 74 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,41 +16,101 @@
1616
*/
1717
package org.apache.lucene.analysis.miscellaneous;
1818

19+
import static java.lang.Character.isHighSurrogate;
20+
import static java.lang.Character.isLowSurrogate;
21+
1922
import java.io.IOException;
2023
import org.apache.lucene.analysis.TokenFilter;
2124
import org.apache.lucene.analysis.TokenStream;
2225
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
2326
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
2427

2528
/**
26-
* A token filter for truncating the terms into a specific length. Fixed prefix truncation, as a
27-
* stemming method, produces good results on Turkish language. It is reported that F5, using first 5
28-
* characters, produced best results in <a
29-
* href="http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf">Information Retrieval on
30-
* Turkish Texts</a>
29+
* A token filter for truncating the terms into a specific length (number of codepoints). Fixed
30+
* prefix truncation, as a stemming method, produces good results on Turkish language. It is
31+
* reported that F5, using first 5 characters, produced best results in <a
32+
* href="https://doi.org/10.1002/asi.20750">Information Retrieval on Turkish Texts</a>
33+
*
34+
* <p>Since Lucene 10.5, the filter is able to correctly handle codepoints and truncates after the
35+
* given number of codepoints, no longer producing incomplete surrogate pairs. Use the modern
36+
* factory method {@link #truncateAfterCodePoints(TokenStream, int)} to enable this mode. Legacy
37+
* behaviour is still available with {@link #truncateAfterChars(TokenStream, int)}
3138
*/
3239
public final class TruncateTokenFilter extends TokenFilter {
3340

3441
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
3542
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
3643

37-
private final int length;
44+
private final int truncateAfter;
45+
private final boolean useCodePoints;
46+
47+
/** Returns a filter with a prefix of {@code nCodePoints}. */
48+
public static TruncateTokenFilter truncateAfterCodePoints(TokenStream input, int nCodePoints) {
49+
return new TruncateTokenFilter(input, nCodePoints, true);
50+
}
51+
52+
/**
53+
* Returns a filter with a prefix of {@code nChars} Java Characters. This may split surrogate
54+
* pairs.
55+
*/
56+
public static TruncateTokenFilter truncateAfterChars(TokenStream input, int nChars) {
57+
return new TruncateTokenFilter(input, nChars, false);
58+
}
59+
60+
/**
61+
* Instantiates filter with a prefix of {@code nChars} Java Characters. This may split surrogate
62+
* pairs.
63+
*
64+
* @deprecated This constructor is deprecated, use {@link #truncateAfterChars(TokenStream, int)}
65+
* for backwards compatibility, or {@link #truncateAfterCodePoints(TokenStream, int)} to be
66+
* unicode conformant.
67+
*/
68+
@Deprecated
69+
public TruncateTokenFilter(TokenStream input, int nChars) {
70+
this(input, nChars, false);
71+
}
3872

39-
public TruncateTokenFilter(TokenStream input, int length) {
73+
private TruncateTokenFilter(TokenStream input, int truncateAfter, boolean useCodePoints) {
4074
super(input);
41-
if (length < 1)
42-
throw new IllegalArgumentException("length parameter must be a positive number: " + length);
43-
this.length = length;
75+
if (truncateAfter < 1) {
76+
throw new IllegalArgumentException(
77+
"truncateAfter parameter must be a positive number: " + truncateAfter);
78+
}
79+
this.truncateAfter = truncateAfter;
80+
this.useCodePoints = useCodePoints;
4481
}
4582

4683
@Override
4784
public final boolean incrementToken() throws IOException {
48-
if (input.incrementToken()) {
49-
if (!keywordAttr.isKeyword() && termAttribute.length() > length)
50-
termAttribute.setLength(length);
85+
if (!input.incrementToken()) {
86+
return false;
87+
}
88+
if (keywordAttr.isKeyword()) {
5189
return true;
90+
}
91+
final int len = termAttribute.length();
92+
if (len <= truncateAfter) {
93+
// the term is short enough, so we do not need to modify it
94+
// (works for both chars and codepoints)
95+
return true;
96+
}
97+
if (useCodePoints) {
98+
// code based on ICU4J's com.ibm.icu.text.UTF16#findOffsetFromCodePoint(...) implementation:
99+
final char[] arr = termAttribute.buffer();
100+
int ofs = 0, remaining = truncateAfter;
101+
while (ofs < len && remaining > 0) {
102+
if (isHighSurrogate(arr[ofs++]) && ofs < len && isLowSurrogate(arr[ofs])) {
103+
ofs++;
104+
}
105+
remaining--;
106+
}
107+
// check if we actually reached the limit and set new length based on calculated offset:
108+
if (remaining == 0) {
109+
termAttribute.setLength(ofs);
110+
}
52111
} else {
53-
return false;
112+
termAttribute.setLength(truncateAfter);
54113
}
114+
return true;
55115
}
56116
}

lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilterFactory.java

Lines changed: 46 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,26 @@
1717
package org.apache.lucene.analysis.miscellaneous;
1818

1919
import java.util.Map;
20+
import java.util.function.BiFunction;
2021
import org.apache.lucene.analysis.TokenFilterFactory;
2122
import org.apache.lucene.analysis.TokenStream;
23+
import org.apache.lucene.util.Version;
2224

2325
/**
24-
* Factory for {@link org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter}. The following
25-
* type is recommended for "<i>diacritics-insensitive search</i>" for Turkish.
26+
* Factory for {@link org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter}.
27+
*
28+
* <p>Fixed prefix truncation, as a stemming method, produces good results on Turkish language. It
29+
* is reported that F5, using first 5 characters, produced best results in <a
30+
* href="https://doi.org/10.1002/asi.20750">Information Retrieval on Turkish Texts</a>
31+
*
32+
* <p>Since Lucene 10.5, the filter correctly handles codepoints and truncates after {@code
33+
* truncateAfterCodePoints} codepoints, no longer producing incomplete surrogate pairs. For
34+
* backwards compatibility the old {@code prefixLength} is still supported and its behaviour depends
35+
* on the {@code luceneMatchVersion} parameter. If no parameter is given, it uses a prefix length of
36+
* 5. In case you change to the more modern codepoint behaviour, reindexing may be required if your
37+
* documents contain surrogate pairs (like emojis).
38+
*
39+
* <p>The following type is recommended for "<i>diacritics-insensitive search</i>" for Turkish:
2640
*
2741
* <pre><code class="language-xml">
2842
* &lt;fieldType name="text_tr_ascii_f5" class="solr.TextField" positionIncrementGap="100"&gt;
@@ -32,7 +46,7 @@
3246
* &lt;filter class="solr.TurkishLowerCaseFilterFactory"/&gt;
3347
* &lt;filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="true"/&gt;
3448
* &lt;filter class="solr.KeywordRepeatFilterFactory"/&gt;
35-
* &lt;filter class="solr.TruncateTokenFilterFactory" prefixLength="5"/&gt;
49+
* &lt;filter class="solr.TruncateTokenFilterFactory" truncateAfterCodePoints="5"/&gt;
3650
* &lt;filter class="solr.RemoveDuplicatesTokenFilterFactory"/&gt;
3751
* &lt;/analyzer&gt;
3852
* &lt;/fieldType&gt;</code></pre>
@@ -45,27 +59,50 @@ public class TruncateTokenFilterFactory extends TokenFilterFactory {
4559
/** SPI name */
4660
public static final String NAME = "truncate";
4761

48-
public static final String PREFIX_LENGTH_KEY = "prefixLength";
49-
private final int prefixLength;
62+
@Deprecated public static final String PREFIX_LENGTH_KEY = "prefixLength";
63+
public static final String TRUNCATE_AFTER_CODEPOINTS_KEY = "truncateAfterCodePoints";
64+
public static final String TRUNCATE_AFTER_CHARS_KEY = "truncateAfterChars";
65+
66+
private final int truncateAfter;
67+
private final BiFunction<TokenStream, Integer, TruncateTokenFilter> factory;
5068

5169
public TruncateTokenFilterFactory(Map<String, String> args) {
5270
super(args);
53-
prefixLength = Integer.parseInt(get(args, PREFIX_LENGTH_KEY, "5"));
54-
if (prefixLength < 1)
71+
Map<String, BiFunction<TokenStream, Integer, TruncateTokenFilter>> paramMapping =
72+
Map.of(
73+
TRUNCATE_AFTER_CODEPOINTS_KEY, TruncateTokenFilter::truncateAfterCodePoints,
74+
TRUNCATE_AFTER_CHARS_KEY, TruncateTokenFilter::truncateAfterChars,
75+
PREFIX_LENGTH_KEY, this::legacyPrefixLengthFactory);
76+
var avail = paramMapping.keySet().stream().filter(args::containsKey).toList();
77+
if (avail.size() > 1) {
5578
throw new IllegalArgumentException(
56-
PREFIX_LENGTH_KEY + " parameter must be a positive number: " + prefixLength);
79+
"Can only give one of the following parameters: " + paramMapping.keySet());
80+
}
81+
String param = avail.stream().findFirst().orElse(PREFIX_LENGTH_KEY);
82+
this.truncateAfter = getInt(args, param, 5);
83+
this.factory = paramMapping.get(param);
84+
if (truncateAfter < 1) {
85+
throw new IllegalArgumentException(
86+
param + " parameter must be a positive number: " + truncateAfter);
87+
}
5788
if (!args.isEmpty()) {
5889
throw new IllegalArgumentException("Unknown parameter(s): " + args);
5990
}
6091
}
6192

93+
private TruncateTokenFilter legacyPrefixLengthFactory(TokenStream input, int prefixChars) {
94+
return (luceneMatchVersion.onOrAfter(Version.LUCENE_10_5_0))
95+
? TruncateTokenFilter.truncateAfterCodePoints(input, prefixChars)
96+
: TruncateTokenFilter.truncateAfterChars(input, prefixChars);
97+
}
98+
6299
/** Default ctor for compatibility with SPI */
public TruncateTokenFilterFactory() {
  // SPI discovery requires a visible no-arg constructor to exist; it must never be invoked
  throw defaultCtorException();
}
66103

67104
@Override
68105
public TokenStream create(TokenStream input) {
69-
return new TruncateTokenFilter(input, prefixLength);
106+
return factory.apply(input, truncateAfter);
70107
}
71108
}

lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConditionalTokenFilter.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,8 @@ public void testMultipleConditionalFilters() throws IOException {
323323
new SkipMatchingFilter(
324324
stream,
325325
in -> {
326-
TruncateTokenFilter truncateFilter = new TruncateTokenFilter(in, 2);
326+
TruncateTokenFilter truncateFilter =
327+
TruncateTokenFilter.truncateAfterCodePoints(in, 2);
327328
return new AssertingLowerCaseFilter(truncateFilter);
328329
},
329330
".*o.*");

lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTruncateTokenFilter.java

Lines changed: 106 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,22 +16,123 @@
1616
*/
1717
package org.apache.lucene.analysis.miscellaneous;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.tests.analysis.MockTokenizer;
import org.apache.lucene.tests.util.TestUtil;
import org.junit.Test;

/** Test the truncate token filter. */
public class TestTruncateTokenFilter extends BaseTokenStreamTestCase {

  // Legacy char-counting mode: counts UTF-16 code units, so the last input token is cut in the
  // middle of the emoji's surrogate pair, leaving a lone high surrogate in the output.
  public void testLegacyTruncating() throws Exception {
    TokenStream stream =
        whitespaceMockTokenizer("abcdefg 1234567 ABCDEFG abcde abc 12345 123 1234567 1234😃5");
    stream = TruncateTokenFilter.truncateAfterChars(stream, 5);
    assertTokenStreamContents(
        stream,
        new String[] {
          "abcde",
          "12345",
          "ABCDE",
          "abcde",
          "abc",
          "12345",
          "123",
          "12345",
          "1234" + "😃".charAt(0) // half surrogate: the legacy mode splits the pair
        });
  }

  // Codepoint mode: each emoji (a surrogate pair) counts as one codepoint and is never split.
  public void testCodePointTruncating() throws Exception {
    TokenStream stream =
        whitespaceMockTokenizer(
            "abcdefg 1234567 ABCDEFG abcde abc 12345 123 1234😃5 1 😃 😃12345 😃😃 😃😃😃 😃😃😃😃 😃😃😃😃😃 😃😃😃😃😃😃");
    stream = TruncateTokenFilter.truncateAfterCodePoints(stream, 5);
    assertTokenStreamContents(
        stream,
        new String[] {
          "abcde",
          "12345",
          "ABCDE",
          "abcde",
          "abc",
          "12345",
          "123",
          "1234😃",
          "1",
          "😃",
          "😃1234",
          "😃😃",
          "😃😃😃",
          "😃😃😃😃",
          "😃😃😃😃😃",
          "😃😃😃😃😃" // six emojis truncated to five complete codepoints
        });
  }

  // Cross-checks the truncated stream against an untruncated stream over random analysis strings:
  // short terms pass through unchanged; long terms become exact truncateLength-codepoint prefixes.
  public void testRandom() throws Exception {
    var rnd = random();
    for (int i = 0; i < 50 * RANDOM_MULTIPLIER; i++) {
      var truncateLength = rnd.nextInt(5) + 1;
      String text = TestUtil.randomAnalysisString(rnd, 200, false);

      TokenStream ts1 = whitespaceMockTokenizer(text);
      CharTermAttribute termAtt1 = ts1.addAttribute(CharTermAttribute.class);
      TokenStream ts2 =
          TruncateTokenFilter.truncateAfterCodePoints(
              whitespaceMockTokenizer(text), truncateLength);
      CharTermAttribute termAtt2 = ts2.addAttribute(CharTermAttribute.class);

      ts1.reset();
      ts2.reset();
      while (ts2.incrementToken()) {
        assertTrue(ts1.incrementToken());
        // compare codepoint counts, not char counts, since truncation is codepoint-based
        int len1 = Character.codePointCount(termAtt1, 0, termAtt1.length());
        int len2 = Character.codePointCount(termAtt2, 0, termAtt2.length());
        if (len1 <= truncateLength) {
          // term already within the limit: must be passed through unchanged
          assertEquals(len1, len2);
          assertEquals(termAtt1.toString(), termAtt2.toString());
        } else {
          // term over the limit: must be a prefix of exactly truncateLength codepoints
          assertEquals(truncateLength, len2);
          assertTrue(termAtt1.toString().startsWith(termAtt2.toString()));
        }
      }
      assertFalse(ts1.incrementToken());
      ts1.end();
      ts2.end();
      ts1.close();
      ts2.close();
    }
  }

  // Runs the standard random-data stress harness against the codepoint-truncating filter.
  public void testStressRandom() throws Exception {
    var rnd = random();
    var truncateLength = rnd.nextInt(5) + 1;
    Analyzer a =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(
                tokenizer, TruncateTokenFilter.truncateAfterCodePoints(tokenizer, truncateLength));
          }
        };
    checkRandomData(rnd, a, 20 * RANDOM_MULTIPLIER, truncateLength * 2);
  }

  // Non-positive lengths must be rejected by the legacy char-based factory method.
  @Test(expected = IllegalArgumentException.class)
  public void testLegacyNonPositiveLength() throws Exception {
    TruncateTokenFilter.truncateAfterChars(
        whitespaceMockTokenizer("param must be a positive number"), -48);
  }

  // Non-positive lengths must be rejected by the codepoint-based factory method as well.
  @Test(expected = IllegalArgumentException.class)
  public void testNonPositiveLength() throws Exception {
    TruncateTokenFilter.truncateAfterCodePoints(
        whitespaceMockTokenizer("param must be a positive number"), -48);
  }
}

0 commit comments

Comments
 (0)