N-gram token filter
Forms n-grams of specified lengths from a token.
For example, you can use the ngram token filter to change fox to
[ f, fo, o, ox, x ].
This filter uses Lucene’s NGramTokenFilter.
The ngram filter is similar to the
edge_ngram token filter. However, the
edge_ngram filter only outputs n-grams that start at the beginning of a token.
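The difference is easiest to see on a single token. The sketch below is a minimal illustration using the Python client (assuming a connected client instance named client, as in the examples that follow); it configures an inline edge_ngram filter with 1- and 2-character grams so the output can be compared directly with the ngram default behavior described above.
# Hypothetical comparison of edge_ngram and ngram on the token "fox".
# Assumes `client` is an already-connected Elasticsearch Python client.
resp = client.indices.analyze(
    tokenizer="standard",
    filter=[
        {"type": "edge_ngram", "min_gram": 1, "max_gram": 2}
    ],
    text="fox",
)
print(resp)  # edge_ngram only emits grams anchored at the start: f, fo

resp = client.indices.analyze(
    tokenizer="standard",
    filter=["ngram"],  # built-in defaults: min_gram 1, max_gram 2
    text="fox",
)
print(resp)  # ngram also emits interior grams: f, fo, o, ox, x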
Example
The following analyze API request uses the ngram
filter to convert Quick fox to 1-character and 2-character n-grams:
resp = client.indices.analyze(
tokenizer="standard",
filter=[
"ngram"
],
text="Quick fox",
)
print(resp)
response = client.indices.analyze(
body: {
tokenizer: 'standard',
filter: [
'ngram'
],
text: 'Quick fox'
}
)
puts response
const response = await client.indices.analyze({
tokenizer: "standard",
filter: ["ngram"],
text: "Quick fox",
});
console.log(response);
GET _analyze
{
"tokenizer": "standard",
"filter": [ "ngram" ],
"text": "Quick fox"
}
The filter produces the following tokens:
[ Q, Qu, u, ui, i, ic, c, ck, k, f, fo, o, ox, x ]
Add to an analyzer
The following create index API request uses the ngram
filter to configure a new custom analyzer.
resp = client.indices.create(
index="ngram_example",
settings={
"analysis": {
"analyzer": {
"standard_ngram": {
"tokenizer": "standard",
"filter": [
"ngram"
]
}
}
}
},
)
print(resp)
response = client.indices.create(
index: 'ngram_example',
body: {
settings: {
analysis: {
analyzer: {
standard_ngram: {
tokenizer: 'standard',
filter: [
'ngram'
]
}
}
}
}
}
)
puts response
const response = await client.indices.create({
index: "ngram_example",
settings: {
analysis: {
analyzer: {
standard_ngram: {
tokenizer: "standard",
filter: ["ngram"],
},
},
},
},
});
console.log(response);
PUT ngram_example
{
"settings": {
"analysis": {
"analyzer": {
"standard_ngram": {
"tokenizer": "standard",
"filter": [ "ngram" ]
}
}
}
}
}
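To check the new analyzer, you could run an analyze API request against the index and name the custom analyzer. This is a minimal sketch, assuming the ngram_example index created by the request above:
resp = client.indices.analyze(
    index="ngram_example",
    analyzer="standard_ngram",
    text="Quick fox",
)
print(resp)  # same 1- and 2-character grams as the earlier example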
Configurable parameters
max_gram
(Optional, integer) Maximum length of characters in a gram. Defaults to 2.
min_gram
(Optional, integer) Minimum length of characters in a gram. Defaults to 1.
preserve_original
(Optional, Boolean) Emits the original token when set to true. Defaults to false.
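The preserve_original parameter is not used in the examples on this page; the following is a minimal sketch of how it could be enabled on an inline ngram filter in an analyze API request (the filter definition here is for illustration only):
resp = client.indices.analyze(
    tokenizer="standard",
    filter=[
        {"type": "ngram", "min_gram": 1, "max_gram": 2, "preserve_original": True}
    ],
    text="fox",
)
print(resp)  # emits the grams f, fo, o, ox, x plus the original token fox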
You can use the index.max_ngram_diff index-level
setting to control the maximum allowed difference between the max_gram and
min_gram values.
Customize
To customize the ngram filter, duplicate it to create the basis for a new
custom token filter. You can modify the filter using its configurable
parameters.
For example, the following request creates a custom ngram filter that forms
n-grams between 3 and 5 characters in length. The request also increases the
index.max_ngram_diff setting to 2.
resp = client.indices.create(
index="ngram_custom_example",
settings={
"index": {
"max_ngram_diff": 2
},
"analysis": {
"analyzer": {
"default": {
"tokenizer": "whitespace",
"filter": [
"3_5_grams"
]
}
},
"filter": {
"3_5_grams": {
"type": "ngram",
"min_gram": 3,
"max_gram": 5
}
}
}
},
)
print(resp)
response = client.indices.create(
index: 'ngram_custom_example',
body: {
settings: {
index: {
max_ngram_diff: 2
},
analysis: {
analyzer: {
default: {
tokenizer: 'whitespace',
filter: [
'3_5_grams'
]
}
},
filter: {
"3_5_grams": {
type: 'ngram',
min_gram: 3,
max_gram: 5
}
}
}
}
}
)
puts response
const response = await client.indices.create({
index: "ngram_custom_example",
settings: {
index: {
max_ngram_diff: 2,
},
analysis: {
analyzer: {
default: {
tokenizer: "whitespace",
filter: ["3_5_grams"],
},
},
filter: {
"3_5_grams": {
type: "ngram",
min_gram: 3,
max_gram: 5,
},
},
},
},
});
console.log(response);
PUT ngram_custom_example
{
"settings": {
"index": {
"max_ngram_diff": 2
},
"analysis": {
"analyzer": {
"default": {
"tokenizer": "whitespace",
"filter": [ "3_5_grams" ]
}
},
"filter": {
"3_5_grams": {
"type": "ngram",
"min_gram": 3,
"max_gram": 5
}
}
}
}
}
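Because the custom filter is attached to the index's default analyzer, an analyze API request against the index with no analyzer specified uses it automatically. This is a minimal sketch, assuming the ngram_custom_example index created above:
resp = client.indices.analyze(
    index="ngram_custom_example",
    text="Quick fox",
)
print(resp)  # only 3- to 5-character grams, such as Qui, Quic, Quick, uic, ... and fox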