Fingerprint token filter
Sorts and removes duplicate tokens from a token stream, then concatenates the stream into a single output token.
For example, this filter changes the [ the, fox, was, very, very, quick ]
token stream as follows:

1. Sorts the tokens alphabetically to [ fox, quick, the, very, very, was ].
2. Removes a duplicate instance of the very token.
3. Concatenates the token stream to a single output token: [ fox quick the very was ]
Output tokens produced by this filter are useful for fingerprinting and clustering a body of text as described in the OpenRefine project.
This filter uses Lucene’s FingerprintFilter.
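To make the behavior concrete, here is a minimal plain-Python sketch of the sort, deduplicate, and concatenate steps. It is an illustration of the logic only, not the Lucene implementation:

def fingerprint(tokens, separator=" ", max_output_size=255):
    # Sort alphabetically, remove duplicates, then join into one token.
    output = separator.join(sorted(set(tokens)))
    # Concatenated output longer than max_output_size yields no token.
    return output if len(output) <= max_output_size else None

fingerprint(["the", "fox", "was", "very", "very", "quick"])
# => 'fox quick the very was'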
Example
The following analyze API request uses the fingerprint
filter to create a single output token for the text zebra jumps over resting
resting dog:
resp = client.indices.analyze(
tokenizer="whitespace",
filter=[
"fingerprint"
],
text="zebra jumps over resting resting dog",
)
print(resp)
response = client.indices.analyze(
body: {
tokenizer: 'whitespace',
filter: [
'fingerprint'
],
text: 'zebra jumps over resting resting dog'
}
)
puts response
const response = await client.indices.analyze({
tokenizer: "whitespace",
filter: ["fingerprint"],
text: "zebra jumps over resting resting dog",
});
console.log(response);
GET _analyze
{
"tokenizer" : "whitespace",
"filter" : ["fingerprint"],
"text" : "zebra jumps over resting resting dog"
}
The filter produces the following token:
[ dog jumps over resting zebra ]
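For reference, the analyze API wraps the result in a tokens array. The response to the request above looks along these lines (the type and offset values are shown for illustration):

{
  "tokens": [
    {
      "token": "dog jumps over resting zebra",
      "start_offset": 0,
      "end_offset": 36,
      "type": "fingerprint",
      "position": 0
    }
  ]
}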
Add to an analyzer
The following create index API request uses the
fingerprint filter to configure a new custom
analyzer.
resp = client.indices.create(
index="fingerprint_example",
settings={
"analysis": {
"analyzer": {
"whitespace_fingerprint": {
"tokenizer": "whitespace",
"filter": [
"fingerprint"
]
}
}
}
},
)
print(resp)
response = client.indices.create(
index: 'fingerprint_example',
body: {
settings: {
analysis: {
analyzer: {
whitespace_fingerprint: {
tokenizer: 'whitespace',
filter: [
'fingerprint'
]
}
}
}
}
}
)
puts response
const response = await client.indices.create({
index: "fingerprint_example",
settings: {
analysis: {
analyzer: {
whitespace_fingerprint: {
tokenizer: "whitespace",
filter: ["fingerprint"],
},
},
},
},
});
console.log(response);
PUT fingerprint_example
{
"settings": {
"analysis": {
"analyzer": {
"whitespace_fingerprint": {
"tokenizer": "whitespace",
"filter": [ "fingerprint" ]
}
}
}
}
}
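Once the index exists, you can check the new analyzer by running the analyze API against it, for example with the Python client:

resp = client.indices.analyze(
    index="fingerprint_example",
    analyzer="whitespace_fingerprint",
    text="zebra jumps over resting resting dog",
)
print(resp)

This again produces the single token [ dog jumps over resting zebra ].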
Configurable parameters
max_output_size
(Optional, integer) Maximum character length, including whitespace, of the output token. Defaults to 255. Concatenated tokens longer than this size result in no token output.
separator
(Optional, string) Character to use to concatenate the token stream input. Defaults to a space.
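You can try a parameter directly in an analyze API request by passing an inline filter definition. For example, the following request (Python client shown) sets max_output_size low enough that the concatenated token is dropped:

resp = client.indices.analyze(
    tokenizer="whitespace",
    filter=[
        {
            "type": "fingerprint",
            "max_output_size": 10
        }
    ],
    text="zebra jumps over resting resting dog",
)
print(resp)

The concatenated token dog jumps over resting zebra is 28 characters, longer than 10, so the response contains an empty tokens array.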
Customize
To customize the fingerprint filter, duplicate it to create the basis
for a new custom token filter. You can modify the filter using its configurable
parameters.
For example, the following request creates a custom fingerprint filter
that uses + to concatenate the token stream. The filter also limits
output tokens to 100 characters or fewer.
resp = client.indices.create(
index="custom_fingerprint_example",
settings={
"analysis": {
"analyzer": {
"whitespace_": {
"tokenizer": "whitespace",
"filter": [
"fingerprint_plus_concat"
]
}
},
"filter": {
"fingerprint_plus_concat": {
"type": "fingerprint",
"max_output_size": 100,
"separator": "+"
}
}
}
},
)
print(resp)
response = client.indices.create(
index: 'custom_fingerprint_example',
body: {
settings: {
analysis: {
analyzer: {
whitespace_: {
tokenizer: 'whitespace',
filter: [
'fingerprint_plus_concat'
]
}
},
filter: {
fingerprint_plus_concat: {
type: 'fingerprint',
max_output_size: 100,
separator: '+'
}
}
}
}
}
)
puts response
const response = await client.indices.create({
index: "custom_fingerprint_example",
settings: {
analysis: {
analyzer: {
whitespace_: {
tokenizer: "whitespace",
filter: ["fingerprint_plus_concat"],
},
},
filter: {
fingerprint_plus_concat: {
type: "fingerprint",
max_output_size: 100,
separator: "+",
},
},
},
},
});
console.log(response);
PUT custom_fingerprint_example
{
"settings": {
"analysis": {
"analyzer": {
"whitespace_": {
"tokenizer": "whitespace",
"filter": [ "fingerprint_plus_concat" ]
}
},
"filter": {
"fingerprint_plus_concat": {
"type": "fingerprint",
"max_output_size": 100,
"separator": "+"
}
}
}
}
}
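As before, you can verify the custom filter with the analyze API; the + separator appears in the output token:

resp = client.indices.analyze(
    index="custom_fingerprint_example",
    analyzer="whitespace_",
    text="zebra jumps over resting resting dog",
)
print(resp)

The filter produces the following token:
[ dog+jumps+over+resting+zebra ]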