191 lines
6.0 KiB
Rust
191 lines
6.0 KiB
Rust
use std::path::{Path, PathBuf};
|
|
|
|
use tantivy::schema::{
|
|
Field, IndexRecordOption, JsonObjectOptions, Schema, Term, TextFieldIndexing, TextOptions,
|
|
INDEXED, STORED, STRING,
|
|
};
|
|
use tantivy::tokenizer::{
|
|
AsciiFoldingFilter, LowerCaser, NgramTokenizer, RawTokenizer, RemoveLongFilter,
|
|
SimpleTokenizer, TextAnalyzer, TokenStream,
|
|
};
|
|
use tantivy::Index;
|
|
|
|
/// Name of the `u64` schema field registered as `INDEXED | STORED`.
pub const F_PG_ID: &str = "pg_id";
|
|
/// Name of the text schema field registered as `STRING | STORED`.
pub const F_TABLE_NAME: &str = "table_name";
|
|
/// Name of the text schema field registered as `STRING | STORED`.
/// NOTE(review): presumably holds the value built by `search_row_key` — confirm at the indexing call site.
pub const F_ROW_KEY: &str = "row_key";
|
|
/// Name of the text schema field indexed with the `TOK_WORD` tokenizer (frequencies and positions).
pub const F_ALL_TEXT: &str = "all_text";
|
|
/// Name of the JSON schema field indexed with the `TOK_WORD` tokenizer (frequencies and positions).
pub const F_DATA_WORD: &str = "data_word";
|
|
/// Name of the JSON schema field indexed with the `TOK_NGRAM` tokenizer (frequencies and positions).
pub const F_DATA_NGRAM: &str = "data_ngram";
|
|
/// Name of the JSON schema field indexed with the `TOK_EXACT` tokenizer (basic record option, no positions).
pub const F_DATA_EXACT: &str = "data_exact";
|
|
|
|
/// Registered name of the word analyzer (simple tokenizer, long-token removal, lowercasing, ASCII folding).
pub const TOK_WORD: &str = "kw_word";
|
|
/// Registered name of the n-gram analyzer (`NgramTokenizer::new(3, 3, false)` plus the same filters as the word analyzer).
pub const TOK_NGRAM: &str = "kw_ngram";
|
|
/// Registered name of the exact analyzer (raw tokenizer, lowercasing, ASCII folding).
pub const TOK_EXACT: &str = "kw_exact";
|
|
|
|
/// Builds the on-disk location of a profile's search index.
///
/// The result is `root` with `profile_name` pushed onto it. Standard
/// `PathBuf::push` semantics apply: an absolute `profile_name` replaces
/// `root` instead of being appended to it.
pub fn search_index_path(root: &Path, profile_name: &str) -> PathBuf {
    let mut index_path = root.to_path_buf();
    index_path.push(profile_name);
    index_path
}
|
|
|
|
/// Builds the index key that identifies one table row inside a profile index.
///
/// The key has the form `<table_name>:<row_id>`.
pub fn search_row_key(table_name: &str, row_id: i64) -> String {
    format!("{table_name}:{row_id}")
}
|
|
|
|
/// Normalizes user-entered values for exact-mode terms.
|
|
pub fn normalize_exact(input: &str) -> String {
|
|
let trimmed = input.trim();
|
|
if trimmed.is_empty() {
|
|
return String::new();
|
|
}
|
|
|
|
let mut analyzer = exact_analyzer();
|
|
let mut stream = analyzer.token_stream(trimmed);
|
|
let mut out = String::with_capacity(trimmed.len());
|
|
while let Some(token) = stream.next() {
|
|
out.push_str(&token.text);
|
|
}
|
|
out
|
|
}
|
|
|
|
/// Converts a column name to the JSON-key form used at index time.
///
/// Only ASCII letters are lowercased; every other character is kept as is.
pub fn normalize_column_name(column: &str) -> String {
    let mut key = column.to_owned();
    key.make_ascii_lowercase();
    key
}
|
|
|
|
/// Creates the column-aware search schema.
|
|
pub fn create_search_schema() -> Schema {
|
|
let mut schema_builder = Schema::builder();
|
|
|
|
schema_builder.add_u64_field(F_PG_ID, INDEXED | STORED);
|
|
schema_builder.add_text_field(F_TABLE_NAME, STRING | STORED);
|
|
schema_builder.add_text_field(F_ROW_KEY, STRING | STORED);
|
|
schema_builder.add_text_field(F_ALL_TEXT, text_options(TOK_WORD));
|
|
|
|
schema_builder.add_json_field(F_DATA_WORD, json_options(TOK_WORD, true, false));
|
|
schema_builder.add_json_field(F_DATA_NGRAM, json_options(TOK_NGRAM, true, false));
|
|
schema_builder.add_json_field(F_DATA_EXACT, json_options(TOK_EXACT, false, false));
|
|
|
|
schema_builder.build()
|
|
}
|
|
|
|
fn text_options(tokenizer_name: &str) -> TextOptions {
|
|
let indexing = TextFieldIndexing::default()
|
|
.set_tokenizer(tokenizer_name)
|
|
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
|
|
|
TextOptions::default().set_indexing_options(indexing)
|
|
}
|
|
|
|
fn json_options(tokenizer_name: &str, with_positions: bool, stored: bool) -> JsonObjectOptions {
|
|
let index_option = if with_positions {
|
|
IndexRecordOption::WithFreqsAndPositions
|
|
} else {
|
|
IndexRecordOption::Basic
|
|
};
|
|
|
|
let indexing = TextFieldIndexing::default()
|
|
.set_tokenizer(tokenizer_name)
|
|
.set_index_option(index_option);
|
|
|
|
let mut options = JsonObjectOptions::default().set_indexing_options(indexing);
|
|
if stored {
|
|
options = options.set_stored();
|
|
}
|
|
options
|
|
}
|
|
|
|
/// Registers all required tokenizers with the index.
|
|
pub fn register_tokenizers(index: &Index) -> tantivy::Result<()> {
|
|
let tokenizer_manager = index.tokenizers();
|
|
|
|
tokenizer_manager.register(TOK_WORD, word_analyzer());
|
|
tokenizer_manager.register(TOK_NGRAM, ngram_analyzer()?);
|
|
tokenizer_manager.register(TOK_EXACT, exact_analyzer());
|
|
|
|
Ok(())
|
|
}
|
|
|
|
fn word_analyzer() -> TextAnalyzer {
|
|
TextAnalyzer::builder(SimpleTokenizer::default())
|
|
.filter(RemoveLongFilter::limit(80))
|
|
.filter(LowerCaser)
|
|
.filter(AsciiFoldingFilter)
|
|
.build()
|
|
}
|
|
|
|
fn ngram_analyzer() -> tantivy::Result<TextAnalyzer> {
|
|
Ok(TextAnalyzer::builder(NgramTokenizer::new(3, 3, false)?)
|
|
.filter(RemoveLongFilter::limit(80))
|
|
.filter(LowerCaser)
|
|
.filter(AsciiFoldingFilter)
|
|
.build())
|
|
}
|
|
|
|
fn exact_analyzer() -> TextAnalyzer {
|
|
TextAnalyzer::builder(RawTokenizer::default())
|
|
.filter(LowerCaser)
|
|
.filter(AsciiFoldingFilter)
|
|
.build()
|
|
}
|
|
|
|
/// Tokenizes text the same way `data_word` is indexed.
|
|
pub fn tokenize_word(text: &str) -> Vec<String> {
|
|
tokenize_with(word_analyzer(), text)
|
|
}
|
|
|
|
/// Tokenizes text the same way `data_ngram` is indexed.
|
|
pub fn tokenize_ngram(text: &str) -> Vec<String> {
|
|
match ngram_analyzer() {
|
|
Ok(analyzer) => tokenize_with(analyzer, text),
|
|
Err(_) => Vec::new(),
|
|
}
|
|
}
|
|
|
|
fn tokenize_with(mut analyzer: TextAnalyzer, text: &str) -> Vec<String> {
|
|
let mut stream = analyzer.token_stream(text);
|
|
let mut out = Vec::new();
|
|
while let Some(token) = stream.next() {
|
|
out.push(token.text.clone());
|
|
}
|
|
out
|
|
}
|
|
|
|
/// Builds a term scoped to a specific JSON path within a JSON field.
|
|
pub fn json_path_term(field: Field, column: &str, text: &str) -> Term {
|
|
let mut term = Term::from_field_json_path(field, column, false);
|
|
term.append_type_and_str(text);
|
|
term
|
|
}
|
|
|
|
/// Returns all required schema fields or fails loudly on mismatch.
|
|
pub struct SchemaFields {
|
|
pub pg_id: Field,
|
|
pub table_name: Field,
|
|
pub row_key: Field,
|
|
pub all_text: Field,
|
|
pub data_word: Field,
|
|
pub data_ngram: Field,
|
|
pub data_exact: Field,
|
|
}
|
|
|
|
impl SchemaFields {
|
|
pub fn from(schema: &Schema) -> tantivy::Result<Self> {
|
|
Ok(Self {
|
|
pg_id: get_field(schema, F_PG_ID)?,
|
|
table_name: get_field(schema, F_TABLE_NAME)?,
|
|
row_key: get_field(schema, F_ROW_KEY)?,
|
|
all_text: get_field(schema, F_ALL_TEXT)?,
|
|
data_word: get_field(schema, F_DATA_WORD)?,
|
|
data_ngram: get_field(schema, F_DATA_NGRAM)?,
|
|
data_exact: get_field(schema, F_DATA_EXACT)?,
|
|
})
|
|
}
|
|
}
|
|
|
|
/// Looks up the field called `name` in `schema`.
///
/// # Errors
///
/// A failed lookup is converted into a `TantivyError::SchemaError` whose
/// message names the missing field and includes the underlying error.
fn get_field(schema: &Schema, name: &str) -> tantivy::Result<Field> {
    match schema.get_field(name) {
        Ok(field) => Ok(field),
        Err(e) => Err(tantivy::TantivyError::SchemaError(format!(
            "schema is missing field '{name}': {e}"
        ))),
    }
}
|