Language analyzers
Language analyzers
A set of analyzers aimed at analyzing specific language text. The
following types are supported:
arabic,
armenian,
basque,
bengali,
brazilian,
bulgarian,
catalan,
cjk,
czech,
danish,
dutch,
english,
estonian,
finnish,
french,
galician,
german,
greek,
hindi,
hungarian,
indonesian,
irish,
italian,
latvian,
lithuanian,
norwegian,
persian,
portuguese,
romanian,
russian,
serbian,
sorani,
spanish,
swedish,
turkish,
thai.
Configuring language analyzers
Stopwords
All analyzers support setting custom stopwords either internally in
the config, or by using an external stopwords file by setting
stopwords_path. Check Stop Analyzer for
more details.
Excluding words from stemming
The stem_exclusion parameter allows you to specify an array
of lowercase words that should not be stemmed. Internally, this
functionality is implemented by adding the
keyword_marker token filter
with the keywords set to the value of the stem_exclusion parameter.
The following analyzers support setting a custom stem_exclusion list:
arabic, armenian, basque, bengali, bulgarian, catalan, czech,
dutch, english, finnish, french, galician,
german, hindi, hungarian, indonesian, irish, italian, latvian,
lithuanian, norwegian, portuguese, romanian, russian, serbian,
sorani, spanish, swedish, turkish.
Reimplementing language analyzers
The built-in language analyzers can be reimplemented as custom analyzers
(as described below) in order to customize their behaviour.
If you do not intend to exclude words from being stemmed (the
equivalent of the stem_exclusion parameter above), then you should remove
the keyword_marker token filter from the custom analyzer configuration.
arabic analyzer
The arabic analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'arabic_example',
body: {
settings: {
analysis: {
filter: {
arabic_stop: {
type: 'stop',
stopwords: '_arabic_'
},
arabic_keywords: {
type: 'keyword_marker',
keywords: [
'مثال'
]
},
arabic_stemmer: {
type: 'stemmer',
language: 'arabic'
}
},
analyzer: {
rebuilt_arabic: {
tokenizer: 'standard',
filter: [
'lowercase',
'decimal_digit',
'arabic_stop',
'arabic_normalization',
'arabic_keywords',
'arabic_stemmer'
]
}
}
}
}
}
)
puts response
PUT /arabic_example
{
"settings": {
"analysis": {
"filter": {
"arabic_stop": {
"type": "stop",
"stopwords": "_arabic_"
},
"arabic_keywords": {
"type": "keyword_marker",
"keywords": ["مثال"]
},
"arabic_stemmer": {
"type": "stemmer",
"language": "arabic"
}
},
"analyzer": {
"rebuilt_arabic": {
"tokenizer": "standard",
"filter": [
"lowercase",
"decimal_digit",
"arabic_stop",
"arabic_normalization",
"arabic_keywords",
"arabic_stemmer"
]
}
}
}
}
}
armenian analyzer
The armenian analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'armenian_example',
body: {
settings: {
analysis: {
filter: {
armenian_stop: {
type: 'stop',
stopwords: '_armenian_'
},
armenian_keywords: {
type: 'keyword_marker',
keywords: [
'օրինակ'
]
},
armenian_stemmer: {
type: 'stemmer',
language: 'armenian'
}
},
analyzer: {
rebuilt_armenian: {
tokenizer: 'standard',
filter: [
'lowercase',
'armenian_stop',
'armenian_keywords',
'armenian_stemmer'
]
}
}
}
}
}
)
puts response
PUT /armenian_example
{
"settings": {
"analysis": {
"filter": {
"armenian_stop": {
"type": "stop",
"stopwords": "_armenian_"
},
"armenian_keywords": {
"type": "keyword_marker",
"keywords": ["օրինակ"]
},
"armenian_stemmer": {
"type": "stemmer",
"language": "armenian"
}
},
"analyzer": {
"rebuilt_armenian": {
"tokenizer": "standard",
"filter": [
"lowercase",
"armenian_stop",
"armenian_keywords",
"armenian_stemmer"
]
}
}
}
}
}
basque analyzer
The basque analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'basque_example',
body: {
settings: {
analysis: {
filter: {
basque_stop: {
type: 'stop',
stopwords: '_basque_'
},
basque_keywords: {
type: 'keyword_marker',
keywords: [
'Adibidez'
]
},
basque_stemmer: {
type: 'stemmer',
language: 'basque'
}
},
analyzer: {
rebuilt_basque: {
tokenizer: 'standard',
filter: [
'lowercase',
'basque_stop',
'basque_keywords',
'basque_stemmer'
]
}
}
}
}
}
)
puts response
PUT /basque_example
{
"settings": {
"analysis": {
"filter": {
"basque_stop": {
"type": "stop",
"stopwords": "_basque_"
},
"basque_keywords": {
"type": "keyword_marker",
"keywords": ["Adibidez"]
},
"basque_stemmer": {
"type": "stemmer",
"language": "basque"
}
},
"analyzer": {
"rebuilt_basque": {
"tokenizer": "standard",
"filter": [
"lowercase",
"basque_stop",
"basque_keywords",
"basque_stemmer"
]
}
}
}
}
}
bengali analyzer
The bengali analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'bengali_example',
body: {
settings: {
analysis: {
filter: {
bengali_stop: {
type: 'stop',
stopwords: '_bengali_'
},
bengali_keywords: {
type: 'keyword_marker',
keywords: [
'উদাহরণ'
]
},
bengali_stemmer: {
type: 'stemmer',
language: 'bengali'
}
},
analyzer: {
rebuilt_bengali: {
tokenizer: 'standard',
filter: [
'lowercase',
'decimal_digit',
'bengali_keywords',
'indic_normalization',
'bengali_normalization',
'bengali_stop',
'bengali_stemmer'
]
}
}
}
}
}
)
puts response
PUT /bengali_example
{
"settings": {
"analysis": {
"filter": {
"bengali_stop": {
"type": "stop",
"stopwords": "_bengali_"
},
"bengali_keywords": {
"type": "keyword_marker",
"keywords": ["উদাহরণ"]
},
"bengali_stemmer": {
"type": "stemmer",
"language": "bengali"
}
},
"analyzer": {
"rebuilt_bengali": {
"tokenizer": "standard",
"filter": [
"lowercase",
"decimal_digit",
"bengali_keywords",
"indic_normalization",
"bengali_normalization",
"bengali_stop",
"bengali_stemmer"
]
}
}
}
}
}
brazilian analyzer
The brazilian analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'brazilian_example',
body: {
settings: {
analysis: {
filter: {
brazilian_stop: {
type: 'stop',
stopwords: '_brazilian_'
},
brazilian_keywords: {
type: 'keyword_marker',
keywords: [
'exemplo'
]
},
brazilian_stemmer: {
type: 'stemmer',
language: 'brazilian'
}
},
analyzer: {
rebuilt_brazilian: {
tokenizer: 'standard',
filter: [
'lowercase',
'brazilian_stop',
'brazilian_keywords',
'brazilian_stemmer'
]
}
}
}
}
}
)
puts response
PUT /brazilian_example
{
"settings": {
"analysis": {
"filter": {
"brazilian_stop": {
"type": "stop",
"stopwords": "_brazilian_"
},
"brazilian_keywords": {
"type": "keyword_marker",
"keywords": ["exemplo"]
},
"brazilian_stemmer": {
"type": "stemmer",
"language": "brazilian"
}
},
"analyzer": {
"rebuilt_brazilian": {
"tokenizer": "standard",
"filter": [
"lowercase",
"brazilian_stop",
"brazilian_keywords",
"brazilian_stemmer"
]
}
}
}
}
}
bulgarian analyzer
The bulgarian analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'bulgarian_example',
body: {
settings: {
analysis: {
filter: {
bulgarian_stop: {
type: 'stop',
stopwords: '_bulgarian_'
},
bulgarian_keywords: {
type: 'keyword_marker',
keywords: [
'пример'
]
},
bulgarian_stemmer: {
type: 'stemmer',
language: 'bulgarian'
}
},
analyzer: {
rebuilt_bulgarian: {
tokenizer: 'standard',
filter: [
'lowercase',
'bulgarian_stop',
'bulgarian_keywords',
'bulgarian_stemmer'
]
}
}
}
}
}
)
puts response
PUT /bulgarian_example
{
"settings": {
"analysis": {
"filter": {
"bulgarian_stop": {
"type": "stop",
"stopwords": "_bulgarian_"
},
"bulgarian_keywords": {
"type": "keyword_marker",
"keywords": ["пример"]
},
"bulgarian_stemmer": {
"type": "stemmer",
"language": "bulgarian"
}
},
"analyzer": {
"rebuilt_bulgarian": {
"tokenizer": "standard",
"filter": [
"lowercase",
"bulgarian_stop",
"bulgarian_keywords",
"bulgarian_stemmer"
]
}
}
}
}
}
catalan analyzer
The catalan analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'catalan_example',
body: {
settings: {
analysis: {
filter: {
catalan_elision: {
type: 'elision',
articles: [
'd',
'l',
'm',
'n',
's',
't'
],
articles_case: true
},
catalan_stop: {
type: 'stop',
stopwords: '_catalan_'
},
catalan_keywords: {
type: 'keyword_marker',
keywords: [
'example'
]
},
catalan_stemmer: {
type: 'stemmer',
language: 'catalan'
}
},
analyzer: {
rebuilt_catalan: {
tokenizer: 'standard',
filter: [
'catalan_elision',
'lowercase',
'catalan_stop',
'catalan_keywords',
'catalan_stemmer'
]
}
}
}
}
}
)
puts response
PUT /catalan_example
{
"settings": {
"analysis": {
"filter": {
"catalan_elision": {
"type": "elision",
"articles": [ "d", "l", "m", "n", "s", "t"],
"articles_case": true
},
"catalan_stop": {
"type": "stop",
"stopwords": "_catalan_"
},
"catalan_keywords": {
"type": "keyword_marker",
"keywords": ["example"]
},
"catalan_stemmer": {
"type": "stemmer",
"language": "catalan"
}
},
"analyzer": {
"rebuilt_catalan": {
"tokenizer": "standard",
"filter": [
"catalan_elision",
"lowercase",
"catalan_stop",
"catalan_keywords",
"catalan_stemmer"
]
}
}
}
}
}
cjk analyzer
You may find that icu_analyzer in the ICU analysis plugin works better
for CJK text than the cjk analyzer. Experiment with your text and queries.
The cjk analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'cjk_example',
body: {
settings: {
analysis: {
filter: {
english_stop: {
type: 'stop',
stopwords: [
'a',
'and',
'are',
'as',
'at',
'be',
'but',
'by',
'for',
'if',
'in',
'into',
'is',
'it',
'no',
'not',
'of',
'on',
'or',
's',
'such',
't',
'that',
'the',
'their',
'then',
'there',
'these',
'they',
'this',
'to',
'was',
'will',
'with',
'www'
]
}
},
analyzer: {
rebuilt_cjk: {
tokenizer: 'standard',
filter: [
'cjk_width',
'lowercase',
'cjk_bigram',
'english_stop'
]
}
}
}
}
}
)
puts response
PUT /cjk_example
{
"settings": {
"analysis": {
"filter": {
"english_stop": {
"type": "stop",
"stopwords": [
"a", "and", "are", "as", "at", "be", "but", "by", "for",
"if", "in", "into", "is", "it", "no", "not", "of", "on",
"or", "s", "such", "t", "that", "the", "their", "then",
"there", "these", "they", "this", "to", "was", "will",
"with", "www"
]
}
},
"analyzer": {
"rebuilt_cjk": {
"tokenizer": "standard",
"filter": [
"cjk_width",
"lowercase",
"cjk_bigram",
"english_stop"
]
}
}
}
}
}
czech analyzer
The czech analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'czech_example',
body: {
settings: {
analysis: {
filter: {
czech_stop: {
type: 'stop',
stopwords: '_czech_'
},
czech_keywords: {
type: 'keyword_marker',
keywords: [
'příklad'
]
},
czech_stemmer: {
type: 'stemmer',
language: 'czech'
}
},
analyzer: {
rebuilt_czech: {
tokenizer: 'standard',
filter: [
'lowercase',
'czech_stop',
'czech_keywords',
'czech_stemmer'
]
}
}
}
}
}
)
puts response
PUT /czech_example
{
"settings": {
"analysis": {
"filter": {
"czech_stop": {
"type": "stop",
"stopwords": "_czech_"
},
"czech_keywords": {
"type": "keyword_marker",
"keywords": ["příklad"]
},
"czech_stemmer": {
"type": "stemmer",
"language": "czech"
}
},
"analyzer": {
"rebuilt_czech": {
"tokenizer": "standard",
"filter": [
"lowercase",
"czech_stop",
"czech_keywords",
"czech_stemmer"
]
}
}
}
}
}
danish analyzer
The danish analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'danish_example',
body: {
settings: {
analysis: {
filter: {
danish_stop: {
type: 'stop',
stopwords: '_danish_'
},
danish_keywords: {
type: 'keyword_marker',
keywords: [
'eksempel'
]
},
danish_stemmer: {
type: 'stemmer',
language: 'danish'
}
},
analyzer: {
rebuilt_danish: {
tokenizer: 'standard',
filter: [
'lowercase',
'danish_stop',
'danish_keywords',
'danish_stemmer'
]
}
}
}
}
}
)
puts response
PUT /danish_example
{
"settings": {
"analysis": {
"filter": {
"danish_stop": {
"type": "stop",
"stopwords": "_danish_"
},
"danish_keywords": {
"type": "keyword_marker",
"keywords": ["eksempel"]
},
"danish_stemmer": {
"type": "stemmer",
"language": "danish"
}
},
"analyzer": {
"rebuilt_danish": {
"tokenizer": "standard",
"filter": [
"lowercase",
"danish_stop",
"danish_keywords",
"danish_stemmer"
]
}
}
}
}
}
dutch analyzer
The dutch analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'dutch_example',
body: {
settings: {
analysis: {
filter: {
dutch_stop: {
type: 'stop',
stopwords: '_dutch_'
},
dutch_keywords: {
type: 'keyword_marker',
keywords: [
'voorbeeld'
]
},
dutch_stemmer: {
type: 'stemmer',
language: 'dutch'
},
dutch_override: {
type: 'stemmer_override',
rules: [
'fiets=>fiets',
'bromfiets=>bromfiets',
'ei=>eier',
'kind=>kinder'
]
}
},
analyzer: {
rebuilt_dutch: {
tokenizer: 'standard',
filter: [
'lowercase',
'dutch_stop',
'dutch_keywords',
'dutch_override',
'dutch_stemmer'
]
}
}
}
}
}
)
puts response
PUT /dutch_example
{
"settings": {
"analysis": {
"filter": {
"dutch_stop": {
"type": "stop",
"stopwords": "_dutch_"
},
"dutch_keywords": {
"type": "keyword_marker",
"keywords": ["voorbeeld"]
},
"dutch_stemmer": {
"type": "stemmer",
"language": "dutch"
},
"dutch_override": {
"type": "stemmer_override",
"rules": [
"fiets=>fiets",
"bromfiets=>bromfiets",
"ei=>eier",
"kind=>kinder"
]
}
},
"analyzer": {
"rebuilt_dutch": {
"tokenizer": "standard",
"filter": [
"lowercase",
"dutch_stop",
"dutch_keywords",
"dutch_override",
"dutch_stemmer"
]
}
}
}
}
}
english analyzer
The english analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'english_example',
body: {
settings: {
analysis: {
filter: {
english_stop: {
type: 'stop',
stopwords: '_english_'
},
english_keywords: {
type: 'keyword_marker',
keywords: [
'example'
]
},
english_stemmer: {
type: 'stemmer',
language: 'english'
},
english_possessive_stemmer: {
type: 'stemmer',
language: 'possessive_english'
}
},
analyzer: {
rebuilt_english: {
tokenizer: 'standard',
filter: [
'english_possessive_stemmer',
'lowercase',
'english_stop',
'english_keywords',
'english_stemmer'
]
}
}
}
}
}
)
puts response
PUT /english_example
{
"settings": {
"analysis": {
"filter": {
"english_stop": {
"type": "stop",
"stopwords": "_english_"
},
"english_keywords": {
"type": "keyword_marker",
"keywords": ["example"]
},
"english_stemmer": {
"type": "stemmer",
"language": "english"
},
"english_possessive_stemmer": {
"type": "stemmer",
"language": "possessive_english"
}
},
"analyzer": {
"rebuilt_english": {
"tokenizer": "standard",
"filter": [
"english_possessive_stemmer",
"lowercase",
"english_stop",
"english_keywords",
"english_stemmer"
]
}
}
}
}
}
estonian analyzer
The estonian analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'estonian_example',
body: {
settings: {
analysis: {
filter: {
estonian_stop: {
type: 'stop',
stopwords: '_estonian_'
},
estonian_keywords: {
type: 'keyword_marker',
keywords: [
'näide'
]
},
estonian_stemmer: {
type: 'stemmer',
language: 'estonian'
}
},
analyzer: {
rebuilt_estonian: {
tokenizer: 'standard',
filter: [
'lowercase',
'estonian_stop',
'estonian_keywords',
'estonian_stemmer'
]
}
}
}
}
}
)
puts response
PUT /estonian_example
{
"settings": {
"analysis": {
"filter": {
"estonian_stop": {
"type": "stop",
"stopwords": "_estonian_"
},
"estonian_keywords": {
"type": "keyword_marker",
"keywords": ["näide"]
},
"estonian_stemmer": {
"type": "stemmer",
"language": "estonian"
}
},
"analyzer": {
"rebuilt_estonian": {
"tokenizer": "standard",
"filter": [
"lowercase",
"estonian_stop",
"estonian_keywords",
"estonian_stemmer"
]
}
}
}
}
}
finnish analyzer
The finnish analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'finnish_example',
body: {
settings: {
analysis: {
filter: {
finnish_stop: {
type: 'stop',
stopwords: '_finnish_'
},
finnish_keywords: {
type: 'keyword_marker',
keywords: [
'esimerkki'
]
},
finnish_stemmer: {
type: 'stemmer',
language: 'finnish'
}
},
analyzer: {
rebuilt_finnish: {
tokenizer: 'standard',
filter: [
'lowercase',
'finnish_stop',
'finnish_keywords',
'finnish_stemmer'
]
}
}
}
}
}
)
puts response
PUT /finnish_example
{
"settings": {
"analysis": {
"filter": {
"finnish_stop": {
"type": "stop",
"stopwords": "_finnish_"
},
"finnish_keywords": {
"type": "keyword_marker",
"keywords": ["esimerkki"]
},
"finnish_stemmer": {
"type": "stemmer",
"language": "finnish"
}
},
"analyzer": {
"rebuilt_finnish": {
"tokenizer": "standard",
"filter": [
"lowercase",
"finnish_stop",
"finnish_keywords",
"finnish_stemmer"
]
}
}
}
}
}
french analyzer
The french analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'french_example',
body: {
settings: {
analysis: {
filter: {
french_elision: {
type: 'elision',
articles_case: true,
articles: [
'l',
'm',
't',
'qu',
'n',
's',
'j',
'd',
'c',
'jusqu',
'quoiqu',
'lorsqu',
'puisqu'
]
},
french_stop: {
type: 'stop',
stopwords: '_french_'
},
french_keywords: {
type: 'keyword_marker',
keywords: [
'Example'
]
},
french_stemmer: {
type: 'stemmer',
language: 'light_french'
}
},
analyzer: {
rebuilt_french: {
tokenizer: 'standard',
filter: [
'french_elision',
'lowercase',
'french_stop',
'french_keywords',
'french_stemmer'
]
}
}
}
}
}
)
puts response
PUT /french_example
{
"settings": {
"analysis": {
"filter": {
"french_elision": {
"type": "elision",
"articles_case": true,
"articles": [
"l", "m", "t", "qu", "n", "s",
"j", "d", "c", "jusqu", "quoiqu",
"lorsqu", "puisqu"
]
},
"french_stop": {
"type": "stop",
"stopwords": "_french_"
},
"french_keywords": {
"type": "keyword_marker",
"keywords": ["Example"]
},
"french_stemmer": {
"type": "stemmer",
"language": "light_french"
}
},
"analyzer": {
"rebuilt_french": {
"tokenizer": "standard",
"filter": [
"french_elision",
"lowercase",
"french_stop",
"french_keywords",
"french_stemmer"
]
}
}
}
}
}
galician analyzer
The galician analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'galician_example',
body: {
settings: {
analysis: {
filter: {
galician_stop: {
type: 'stop',
stopwords: '_galician_'
},
galician_keywords: {
type: 'keyword_marker',
keywords: [
'exemplo'
]
},
galician_stemmer: {
type: 'stemmer',
language: 'galician'
}
},
analyzer: {
rebuilt_galician: {
tokenizer: 'standard',
filter: [
'lowercase',
'galician_stop',
'galician_keywords',
'galician_stemmer'
]
}
}
}
}
}
)
puts response
PUT /galician_example
{
"settings": {
"analysis": {
"filter": {
"galician_stop": {
"type": "stop",
"stopwords": "_galician_"
},
"galician_keywords": {
"type": "keyword_marker",
"keywords": ["exemplo"]
},
"galician_stemmer": {
"type": "stemmer",
"language": "galician"
}
},
"analyzer": {
"rebuilt_galician": {
"tokenizer": "standard",
"filter": [
"lowercase",
"galician_stop",
"galician_keywords",
"galician_stemmer"
]
}
}
}
}
}
german analyzer
The german analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'german_example',
body: {
settings: {
analysis: {
filter: {
german_stop: {
type: 'stop',
stopwords: '_german_'
},
german_keywords: {
type: 'keyword_marker',
keywords: [
'Beispiel'
]
},
german_stemmer: {
type: 'stemmer',
language: 'light_german'
}
},
analyzer: {
rebuilt_german: {
tokenizer: 'standard',
filter: [
'lowercase',
'german_stop',
'german_keywords',
'german_normalization',
'german_stemmer'
]
}
}
}
}
}
)
puts response
PUT /german_example
{
"settings": {
"analysis": {
"filter": {
"german_stop": {
"type": "stop",
"stopwords": "_german_"
},
"german_keywords": {
"type": "keyword_marker",
"keywords": ["Beispiel"]
},
"german_stemmer": {
"type": "stemmer",
"language": "light_german"
}
},
"analyzer": {
"rebuilt_german": {
"tokenizer": "standard",
"filter": [
"lowercase",
"german_stop",
"german_keywords",
"german_normalization",
"german_stemmer"
]
}
}
}
}
}
greek analyzer
The greek analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'greek_example',
body: {
settings: {
analysis: {
filter: {
greek_stop: {
type: 'stop',
stopwords: '_greek_'
},
greek_lowercase: {
type: 'lowercase',
language: 'greek'
},
greek_keywords: {
type: 'keyword_marker',
keywords: [
'παράδειγμα'
]
},
greek_stemmer: {
type: 'stemmer',
language: 'greek'
}
},
analyzer: {
rebuilt_greek: {
tokenizer: 'standard',
filter: [
'greek_lowercase',
'greek_stop',
'greek_keywords',
'greek_stemmer'
]
}
}
}
}
}
)
puts response
PUT /greek_example
{
"settings": {
"analysis": {
"filter": {
"greek_stop": {
"type": "stop",
"stopwords": "_greek_"
},
"greek_lowercase": {
"type": "lowercase",
"language": "greek"
},
"greek_keywords": {
"type": "keyword_marker",
"keywords": ["παράδειγμα"]
},
"greek_stemmer": {
"type": "stemmer",
"language": "greek"
}
},
"analyzer": {
"rebuilt_greek": {
"tokenizer": "standard",
"filter": [
"greek_lowercase",
"greek_stop",
"greek_keywords",
"greek_stemmer"
]
}
}
}
}
}
hindi analyzer
The hindi analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'hindi_example',
body: {
settings: {
analysis: {
filter: {
hindi_stop: {
type: 'stop',
stopwords: '_hindi_'
},
hindi_keywords: {
type: 'keyword_marker',
keywords: [
'उदाहरण'
]
},
hindi_stemmer: {
type: 'stemmer',
language: 'hindi'
}
},
analyzer: {
rebuilt_hindi: {
tokenizer: 'standard',
filter: [
'lowercase',
'decimal_digit',
'hindi_keywords',
'indic_normalization',
'hindi_normalization',
'hindi_stop',
'hindi_stemmer'
]
}
}
}
}
}
)
puts response
PUT /hindi_example
{
"settings": {
"analysis": {
"filter": {
"hindi_stop": {
"type": "stop",
"stopwords": "_hindi_"
},
"hindi_keywords": {
"type": "keyword_marker",
"keywords": ["उदाहरण"]
},
"hindi_stemmer": {
"type": "stemmer",
"language": "hindi"
}
},
"analyzer": {
"rebuilt_hindi": {
"tokenizer": "standard",
"filter": [
"lowercase",
"decimal_digit",
"hindi_keywords",
"indic_normalization",
"hindi_normalization",
"hindi_stop",
"hindi_stemmer"
]
}
}
}
}
}
hungarian analyzer
The hungarian analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'hungarian_example',
body: {
settings: {
analysis: {
filter: {
hungarian_stop: {
type: 'stop',
stopwords: '_hungarian_'
},
hungarian_keywords: {
type: 'keyword_marker',
keywords: [
'példa'
]
},
hungarian_stemmer: {
type: 'stemmer',
language: 'hungarian'
}
},
analyzer: {
rebuilt_hungarian: {
tokenizer: 'standard',
filter: [
'lowercase',
'hungarian_stop',
'hungarian_keywords',
'hungarian_stemmer'
]
}
}
}
}
}
)
puts response
PUT /hungarian_example
{
"settings": {
"analysis": {
"filter": {
"hungarian_stop": {
"type": "stop",
"stopwords": "_hungarian_"
},
"hungarian_keywords": {
"type": "keyword_marker",
"keywords": ["példa"]
},
"hungarian_stemmer": {
"type": "stemmer",
"language": "hungarian"
}
},
"analyzer": {
"rebuilt_hungarian": {
"tokenizer": "standard",
"filter": [
"lowercase",
"hungarian_stop",
"hungarian_keywords",
"hungarian_stemmer"
]
}
}
}
}
}
indonesian analyzer
The indonesian analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'indonesian_example',
body: {
settings: {
analysis: {
filter: {
indonesian_stop: {
type: 'stop',
stopwords: '_indonesian_'
},
indonesian_keywords: {
type: 'keyword_marker',
keywords: [
'contoh'
]
},
indonesian_stemmer: {
type: 'stemmer',
language: 'indonesian'
}
},
analyzer: {
rebuilt_indonesian: {
tokenizer: 'standard',
filter: [
'lowercase',
'indonesian_stop',
'indonesian_keywords',
'indonesian_stemmer'
]
}
}
}
}
}
)
puts response
PUT /indonesian_example
{
"settings": {
"analysis": {
"filter": {
"indonesian_stop": {
"type": "stop",
"stopwords": "_indonesian_"
},
"indonesian_keywords": {
"type": "keyword_marker",
"keywords": ["contoh"]
},
"indonesian_stemmer": {
"type": "stemmer",
"language": "indonesian"
}
},
"analyzer": {
"rebuilt_indonesian": {
"tokenizer": "standard",
"filter": [
"lowercase",
"indonesian_stop",
"indonesian_keywords",
"indonesian_stemmer"
]
}
}
}
}
}
irish analyzer
The irish analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'irish_example',
body: {
settings: {
analysis: {
filter: {
irish_hyphenation: {
type: 'stop',
stopwords: [
'h',
'n',
't'
],
ignore_case: true
},
irish_elision: {
type: 'elision',
articles: [
'd',
'm',
'b'
],
articles_case: true
},
irish_stop: {
type: 'stop',
stopwords: '_irish_'
},
irish_lowercase: {
type: 'lowercase',
language: 'irish'
},
irish_keywords: {
type: 'keyword_marker',
keywords: [
'sampla'
]
},
irish_stemmer: {
type: 'stemmer',
language: 'irish'
}
},
analyzer: {
rebuilt_irish: {
tokenizer: 'standard',
filter: [
'irish_hyphenation',
'irish_elision',
'irish_lowercase',
'irish_stop',
'irish_keywords',
'irish_stemmer'
]
}
}
}
}
}
)
puts response
PUT /irish_example
{
"settings": {
"analysis": {
"filter": {
"irish_hyphenation": {
"type": "stop",
"stopwords": [ "h", "n", "t" ],
"ignore_case": true
},
"irish_elision": {
"type": "elision",
"articles": [ "d", "m", "b" ],
"articles_case": true
},
"irish_stop": {
"type": "stop",
"stopwords": "_irish_"
},
"irish_lowercase": {
"type": "lowercase",
"language": "irish"
},
"irish_keywords": {
"type": "keyword_marker",
"keywords": ["sampla"]
},
"irish_stemmer": {
"type": "stemmer",
"language": "irish"
}
},
"analyzer": {
"rebuilt_irish": {
"tokenizer": "standard",
"filter": [
"irish_hyphenation",
"irish_elision",
"irish_lowercase",
"irish_stop",
"irish_keywords",
"irish_stemmer"
]
}
}
}
}
}
italian analyzer
The italian analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'italian_example',
body: {
settings: {
analysis: {
filter: {
italian_elision: {
type: 'elision',
articles: [
'c',
'l',
'all',
'dall',
'dell',
'nell',
'sull',
'coll',
'pell',
'gl',
'agl',
'dagl',
'degl',
'negl',
'sugl',
'un',
'm',
't',
's',
'v',
'd'
],
articles_case: true
},
italian_stop: {
type: 'stop',
stopwords: '_italian_'
},
italian_keywords: {
type: 'keyword_marker',
keywords: [
'esempio'
]
},
italian_stemmer: {
type: 'stemmer',
language: 'light_italian'
}
},
analyzer: {
rebuilt_italian: {
tokenizer: 'standard',
filter: [
'italian_elision',
'lowercase',
'italian_stop',
'italian_keywords',
'italian_stemmer'
]
}
}
}
}
}
)
puts response
PUT /italian_example
{
"settings": {
"analysis": {
"filter": {
"italian_elision": {
"type": "elision",
"articles": [
"c", "l", "all", "dall", "dell",
"nell", "sull", "coll", "pell",
"gl", "agl", "dagl", "degl", "negl",
"sugl", "un", "m", "t", "s", "v", "d"
],
"articles_case": true
},
"italian_stop": {
"type": "stop",
"stopwords": "_italian_"
},
"italian_keywords": {
"type": "keyword_marker",
"keywords": ["esempio"]
},
"italian_stemmer": {
"type": "stemmer",
"language": "light_italian"
}
},
"analyzer": {
"rebuilt_italian": {
"tokenizer": "standard",
"filter": [
"italian_elision",
"lowercase",
"italian_stop",
"italian_keywords",
"italian_stemmer"
]
}
}
}
}
}
latvian analyzer
The latvian analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'latvian_example',
body: {
settings: {
analysis: {
filter: {
latvian_stop: {
type: 'stop',
stopwords: '_latvian_'
},
latvian_keywords: {
type: 'keyword_marker',
keywords: [
'piemērs'
]
},
latvian_stemmer: {
type: 'stemmer',
language: 'latvian'
}
},
analyzer: {
rebuilt_latvian: {
tokenizer: 'standard',
filter: [
'lowercase',
'latvian_stop',
'latvian_keywords',
'latvian_stemmer'
]
}
}
}
}
}
)
puts response
PUT /latvian_example
{
"settings": {
"analysis": {
"filter": {
"latvian_stop": {
"type": "stop",
"stopwords": "_latvian_"
},
"latvian_keywords": {
"type": "keyword_marker",
"keywords": ["piemērs"]
},
"latvian_stemmer": {
"type": "stemmer",
"language": "latvian"
}
},
"analyzer": {
"rebuilt_latvian": {
"tokenizer": "standard",
"filter": [
"lowercase",
"latvian_stop",
"latvian_keywords",
"latvian_stemmer"
]
}
}
}
}
}
lithuanian analyzer
The lithuanian analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'lithuanian_example',
body: {
settings: {
analysis: {
filter: {
lithuanian_stop: {
type: 'stop',
stopwords: '_lithuanian_'
},
lithuanian_keywords: {
type: 'keyword_marker',
keywords: [
'pavyzdys'
]
},
lithuanian_stemmer: {
type: 'stemmer',
language: 'lithuanian'
}
},
analyzer: {
rebuilt_lithuanian: {
tokenizer: 'standard',
filter: [
'lowercase',
'lithuanian_stop',
'lithuanian_keywords',
'lithuanian_stemmer'
]
}
}
}
}
}
)
puts response
PUT /lithuanian_example
{
"settings": {
"analysis": {
"filter": {
"lithuanian_stop": {
"type": "stop",
"stopwords": "_lithuanian_"
},
"lithuanian_keywords": {
"type": "keyword_marker",
"keywords": ["pavyzdys"]
},
"lithuanian_stemmer": {
"type": "stemmer",
"language": "lithuanian"
}
},
"analyzer": {
"rebuilt_lithuanian": {
"tokenizer": "standard",
"filter": [
"lowercase",
"lithuanian_stop",
"lithuanian_keywords",
"lithuanian_stemmer"
]
}
}
}
}
}
norwegian analyzer
The norwegian analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'norwegian_example',
body: {
settings: {
analysis: {
filter: {
norwegian_stop: {
type: 'stop',
stopwords: '_norwegian_'
},
norwegian_keywords: {
type: 'keyword_marker',
keywords: [
'eksempel'
]
},
norwegian_stemmer: {
type: 'stemmer',
language: 'norwegian'
}
},
analyzer: {
rebuilt_norwegian: {
tokenizer: 'standard',
filter: [
'lowercase',
'norwegian_stop',
'norwegian_keywords',
'norwegian_stemmer'
]
}
}
}
}
}
)
puts response
PUT /norwegian_example
{
"settings": {
"analysis": {
"filter": {
"norwegian_stop": {
"type": "stop",
"stopwords": "_norwegian_"
},
"norwegian_keywords": {
"type": "keyword_marker",
"keywords": ["eksempel"]
},
"norwegian_stemmer": {
"type": "stemmer",
"language": "norwegian"
}
},
"analyzer": {
"rebuilt_norwegian": {
"tokenizer": "standard",
"filter": [
"lowercase",
"norwegian_stop",
"norwegian_keywords",
"norwegian_stemmer"
]
}
}
}
}
}
persian analyzer
The persian analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'persian_example',
body: {
settings: {
analysis: {
char_filter: {
zero_width_spaces: {
type: 'mapping',
mappings: [
'\\u200C=>\\u0020'
]
}
},
filter: {
persian_stop: {
type: 'stop',
stopwords: '_persian_'
}
},
analyzer: {
rebuilt_persian: {
tokenizer: 'standard',
char_filter: [
'zero_width_spaces'
],
filter: [
'lowercase',
'decimal_digit',
'arabic_normalization',
'persian_normalization',
'persian_stop'
]
}
}
}
}
}
)
puts response
PUT /persian_example
{
"settings": {
"analysis": {
"char_filter": {
"zero_width_spaces": {
"type": "mapping",
"mappings": [ "\\u200C=>\\u0020"]
}
},
"filter": {
"persian_stop": {
"type": "stop",
"stopwords": "_persian_"
}
},
"analyzer": {
"rebuilt_persian": {
"tokenizer": "standard",
"char_filter": [ "zero_width_spaces" ],
"filter": [
"lowercase",
"decimal_digit",
"arabic_normalization",
"persian_normalization",
"persian_stop"
]
}
}
}
}
}
portuguese analyzer
The portuguese analyzer could be reimplemented as a custom analyzer as follows:
response = client.indices.create(
index: 'portuguese_example',
body: {
settings: {
analysis: {
filter: {
portuguese_stop: {
type: 'stop',
stopwords: '_portuguese_'
},
portuguese_keywords: {
type: 'keyword_marker',
keywords: [
'exemplo'
]
},
portuguese_stemmer: {
type: 'stemmer',
language: 'light_portuguese'
}
},
analyzer: {
rebuilt_portuguese: {
tokenizer: 'standard',
filter: [
'lowercase',
'portuguese_stop',
'portuguese_keywords',
'portuguese_stemmer'
]
}
}
}
}
}
)
puts response
PUT /portuguese_example
{
"settings": {
"analysis": {
"filter": {
"portuguese_stop": {
"type": "stop",
"stopwords": "_portuguese_"
},
"portuguese_keywords": {
"type": "keyword_marker",
"keywords": ["exemplo"]
},
"portuguese_stemmer": {
"type": "stemmer",
"language": "light_portuguese"
}
},
"analyzer": {
"rebuilt_portuguese": {
"tokenizer": "standard",
"filter": [
"lowercase",
"portuguese_stop",
"portuguese_keywords",
"portuguese_stemmer"
]
}
}
}
}
}
romanian analyzer
The romanian analyzer could be reimplemented as a custom analyzer as follows:
# Rebuild the built-in `romanian` analyzer as the custom analyzer
# `rebuilt_romanian` on a new `romanian_example` index.
romanian_filters = {
  romanian_stop: { type: 'stop', stopwords: '_romanian_' },
  # Words listed here are excluded from stemming; remove this filter (and its
  # entry in the chain below) if no stem exclusions are needed.
  romanian_keywords: { type: 'keyword_marker', keywords: ['exemplu'] },
  romanian_stemmer: { type: 'stemmer', language: 'romanian' }
}

# Tokenizer and ordered token filter chain for the rebuilt analyzer.
romanian_analyzer = {
  tokenizer: 'standard',
  filter: %w[lowercase romanian_stop romanian_keywords romanian_stemmer]
}

response = client.indices.create(
  index: 'romanian_example',
  body: {
    settings: {
      analysis: {
        filter: romanian_filters,
        analyzer: { rebuilt_romanian: romanian_analyzer }
      }
    }
  }
)
puts response
PUT /romanian_example
{
  "settings": {
    "analysis": {
      "filter": {
        "romanian_stop": {
          "type": "stop",
          "stopwords": "_romanian_"
        },
        "romanian_keywords": {
          "type": "keyword_marker",
          "keywords": [
            "exemplu"
          ]
        },
        "romanian_stemmer": {
          "type": "stemmer",
          "language": "romanian"
        }
      },
      "analyzer": {
        "rebuilt_romanian": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "romanian_stop",
            "romanian_keywords",
            "romanian_stemmer"
          ]
        }
      }
    }
  }
}
russian analyzer
The russian analyzer could be reimplemented as a custom analyzer as follows:
# Rebuild the built-in `russian` analyzer as the custom analyzer
# `rebuilt_russian` on a new `russian_example` index.
russian_filters = {
  russian_stop: { type: 'stop', stopwords: '_russian_' },
  # Words listed here are excluded from stemming; remove this filter (and its
  # entry in the chain below) if no stem exclusions are needed.
  russian_keywords: { type: 'keyword_marker', keywords: ['пример'] },
  russian_stemmer: { type: 'stemmer', language: 'russian' }
}

# Tokenizer and ordered token filter chain for the rebuilt analyzer.
russian_analyzer = {
  tokenizer: 'standard',
  filter: %w[lowercase russian_stop russian_keywords russian_stemmer]
}

response = client.indices.create(
  index: 'russian_example',
  body: {
    settings: {
      analysis: {
        filter: russian_filters,
        analyzer: { rebuilt_russian: russian_analyzer }
      }
    }
  }
)
puts response
PUT /russian_example
{
  "settings": {
    "analysis": {
      "filter": {
        "russian_stop": {
          "type": "stop",
          "stopwords": "_russian_"
        },
        "russian_keywords": {
          "type": "keyword_marker",
          "keywords": [
            "пример"
          ]
        },
        "russian_stemmer": {
          "type": "stemmer",
          "language": "russian"
        }
      },
      "analyzer": {
        "rebuilt_russian": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "russian_stop",
            "russian_keywords",
            "russian_stemmer"
          ]
        }
      }
    }
  }
}
serbian analyzer
The serbian analyzer could be reimplemented as a custom analyzer as follows:
# Rebuild the built-in `serbian` analyzer as the custom analyzer
# `rebuilt_serbian` on a new `serbian_example` index.
serbian_filters = {
  serbian_stop: { type: 'stop', stopwords: '_serbian_' },
  # Words listed here are excluded from stemming; remove this filter (and its
  # entry in the chain below) if no stem exclusions are needed.
  serbian_keywords: { type: 'keyword_marker', keywords: ['пример'] },
  serbian_stemmer: { type: 'stemmer', language: 'serbian' }
}

# Tokenizer and ordered token filter chain for the rebuilt analyzer;
# `serbian_normalization` comes last, after the stemmer.
serbian_analyzer = {
  tokenizer: 'standard',
  filter: %w[
    lowercase
    serbian_stop
    serbian_keywords
    serbian_stemmer
    serbian_normalization
  ]
}

response = client.indices.create(
  index: 'serbian_example',
  body: {
    settings: {
      analysis: {
        filter: serbian_filters,
        analyzer: { rebuilt_serbian: serbian_analyzer }
      }
    }
  }
)
puts response
PUT /serbian_example
{
  "settings": {
    "analysis": {
      "filter": {
        "serbian_stop": {
          "type": "stop",
          "stopwords": "_serbian_"
        },
        "serbian_keywords": {
          "type": "keyword_marker",
          "keywords": [
            "пример"
          ]
        },
        "serbian_stemmer": {
          "type": "stemmer",
          "language": "serbian"
        }
      },
      "analyzer": {
        "rebuilt_serbian": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "serbian_stop",
            "serbian_keywords",
            "serbian_stemmer",
            "serbian_normalization"
          ]
        }
      }
    }
  }
}
sorani analyzer
The sorani analyzer could be reimplemented as a custom analyzer as follows:
# Rebuild the built-in `sorani` analyzer as the custom analyzer
# `rebuilt_sorani` on a new `sorani_example` index.
sorani_filters = {
  sorani_stop: { type: 'stop', stopwords: '_sorani_' },
  # Words listed here are excluded from stemming; remove this filter (and its
  # entry in the chain below) if no stem exclusions are needed.
  sorani_keywords: { type: 'keyword_marker', keywords: ['mînak'] },
  sorani_stemmer: { type: 'stemmer', language: 'sorani' }
}

# Tokenizer and ordered token filter chain for the rebuilt analyzer;
# `sorani_normalization` is first in the chain, ahead of `lowercase`.
sorani_analyzer = {
  tokenizer: 'standard',
  filter: %w[
    sorani_normalization
    lowercase
    decimal_digit
    sorani_stop
    sorani_keywords
    sorani_stemmer
  ]
}

response = client.indices.create(
  index: 'sorani_example',
  body: {
    settings: {
      analysis: {
        filter: sorani_filters,
        analyzer: { rebuilt_sorani: sorani_analyzer }
      }
    }
  }
)
puts response
PUT /sorani_example
{
  "settings": {
    "analysis": {
      "filter": {
        "sorani_stop": {
          "type": "stop",
          "stopwords": "_sorani_"
        },
        "sorani_keywords": {
          "type": "keyword_marker",
          "keywords": [
            "mînak"
          ]
        },
        "sorani_stemmer": {
          "type": "stemmer",
          "language": "sorani"
        }
      },
      "analyzer": {
        "rebuilt_sorani": {
          "tokenizer": "standard",
          "filter": [
            "sorani_normalization",
            "lowercase",
            "decimal_digit",
            "sorani_stop",
            "sorani_keywords",
            "sorani_stemmer"
          ]
        }
      }
    }
  }
}
spanish analyzer
The spanish analyzer could be reimplemented as a custom analyzer as follows:
# Rebuild the built-in `spanish` analyzer as the custom analyzer
# `rebuilt_spanish` on a new `spanish_example` index.
spanish_filters = {
  spanish_stop: { type: 'stop', stopwords: '_spanish_' },
  # Words listed here are excluded from stemming; remove this filter (and its
  # entry in the chain below) if no stem exclusions are needed.
  spanish_keywords: { type: 'keyword_marker', keywords: ['ejemplo'] },
  spanish_stemmer: { type: 'stemmer', language: 'light_spanish' }
}

# Tokenizer and ordered token filter chain for the rebuilt analyzer.
spanish_analyzer = {
  tokenizer: 'standard',
  filter: %w[lowercase spanish_stop spanish_keywords spanish_stemmer]
}

response = client.indices.create(
  index: 'spanish_example',
  body: {
    settings: {
      analysis: {
        filter: spanish_filters,
        analyzer: { rebuilt_spanish: spanish_analyzer }
      }
    }
  }
)
puts response
PUT /spanish_example
{
  "settings": {
    "analysis": {
      "filter": {
        "spanish_stop": {
          "type": "stop",
          "stopwords": "_spanish_"
        },
        "spanish_keywords": {
          "type": "keyword_marker",
          "keywords": [
            "ejemplo"
          ]
        },
        "spanish_stemmer": {
          "type": "stemmer",
          "language": "light_spanish"
        }
      },
      "analyzer": {
        "rebuilt_spanish": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "spanish_stop",
            "spanish_keywords",
            "spanish_stemmer"
          ]
        }
      }
    }
  }
}
swedish analyzer
The swedish analyzer could be reimplemented as a custom analyzer as follows:
# Rebuild the built-in `swedish` analyzer as the custom analyzer
# `rebuilt_swedish` on a new `swedish_example` index.
swedish_filters = {
  swedish_stop: { type: 'stop', stopwords: '_swedish_' },
  # Words listed here are excluded from stemming; remove this filter (and its
  # entry in the chain below) if no stem exclusions are needed.
  swedish_keywords: { type: 'keyword_marker', keywords: ['exempel'] },
  swedish_stemmer: { type: 'stemmer', language: 'swedish' }
}

# Tokenizer and ordered token filter chain for the rebuilt analyzer.
swedish_analyzer = {
  tokenizer: 'standard',
  filter: %w[lowercase swedish_stop swedish_keywords swedish_stemmer]
}

response = client.indices.create(
  index: 'swedish_example',
  body: {
    settings: {
      analysis: {
        filter: swedish_filters,
        analyzer: { rebuilt_swedish: swedish_analyzer }
      }
    }
  }
)
puts response
PUT /swedish_example
{
  "settings": {
    "analysis": {
      "filter": {
        "swedish_stop": {
          "type": "stop",
          "stopwords": "_swedish_"
        },
        "swedish_keywords": {
          "type": "keyword_marker",
          "keywords": [
            "exempel"
          ]
        },
        "swedish_stemmer": {
          "type": "stemmer",
          "language": "swedish"
        }
      },
      "analyzer": {
        "rebuilt_swedish": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "swedish_stop",
            "swedish_keywords",
            "swedish_stemmer"
          ]
        }
      }
    }
  }
}
turkish analyzer
The turkish analyzer could be reimplemented as a custom analyzer as follows:
# Rebuild the built-in `turkish` analyzer as the custom analyzer
# `rebuilt_turkish` on a new `turkish_example` index.
turkish_filters = {
  turkish_stop: { type: 'stop', stopwords: '_turkish_' },
  # A `lowercase` filter configured with `language: 'turkish'` is used in the
  # chain instead of the plain `lowercase` filter.
  turkish_lowercase: { type: 'lowercase', language: 'turkish' },
  # Words listed here are excluded from stemming; remove this filter (and its
  # entry in the chain below) if no stem exclusions are needed.
  turkish_keywords: { type: 'keyword_marker', keywords: ['örnek'] },
  turkish_stemmer: { type: 'stemmer', language: 'turkish' }
}

# Tokenizer and ordered token filter chain for the rebuilt analyzer;
# `apostrophe` is first in the chain, ahead of lowercasing.
turkish_analyzer = {
  tokenizer: 'standard',
  filter: %w[
    apostrophe
    turkish_lowercase
    turkish_stop
    turkish_keywords
    turkish_stemmer
  ]
}

response = client.indices.create(
  index: 'turkish_example',
  body: {
    settings: {
      analysis: {
        filter: turkish_filters,
        analyzer: { rebuilt_turkish: turkish_analyzer }
      }
    }
  }
)
puts response
PUT /turkish_example
{
  "settings": {
    "analysis": {
      "filter": {
        "turkish_stop": {
          "type": "stop",
          "stopwords": "_turkish_"
        },
        "turkish_lowercase": {
          "type": "lowercase",
          "language": "turkish"
        },
        "turkish_keywords": {
          "type": "keyword_marker",
          "keywords": [
            "örnek"
          ]
        },
        "turkish_stemmer": {
          "type": "stemmer",
          "language": "turkish"
        }
      },
      "analyzer": {
        "rebuilt_turkish": {
          "tokenizer": "standard",
          "filter": [
            "apostrophe",
            "turkish_lowercase",
            "turkish_stop",
            "turkish_keywords",
            "turkish_stemmer"
          ]
        }
      }
    }
  }
}
thai analyzer
The thai analyzer could be reimplemented as a custom analyzer as follows:
# Rebuild the built-in `thai` analyzer as the custom analyzer
# `rebuilt_thai` on a new `thai_example` index.
thai_filters = {
  thai_stop: { type: 'stop', stopwords: '_thai_' }
}

# Uses the `thai` tokenizer rather than `standard`; this example defines no
# keyword_marker or stemmer filter.
thai_analyzer = {
  tokenizer: 'thai',
  filter: %w[lowercase decimal_digit thai_stop]
}

response = client.indices.create(
  index: 'thai_example',
  body: {
    settings: {
      analysis: {
        filter: thai_filters,
        analyzer: { rebuilt_thai: thai_analyzer }
      }
    }
  }
)
puts response