use std::path::{Path, PathBuf}; use tantivy::schema::{ Field, IndexRecordOption, JsonObjectOptions, Schema, Term, TextFieldIndexing, TextOptions, INDEXED, STORED, STRING, }; use tantivy::tokenizer::{ AsciiFoldingFilter, LowerCaser, NgramTokenizer, RawTokenizer, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenStream, }; use tantivy::Index; pub const F_PG_ID: &str = "pg_id"; pub const F_TABLE_NAME: &str = "table_name"; pub const F_ROW_KEY: &str = "row_key"; pub const F_ALL_TEXT: &str = "all_text"; pub const F_DATA_WORD: &str = "data_word"; pub const F_DATA_NGRAM: &str = "data_ngram"; pub const F_DATA_EXACT: &str = "data_exact"; pub const TOK_WORD: &str = "kw_word"; pub const TOK_NGRAM: &str = "kw_ngram"; pub const TOK_EXACT: &str = "kw_exact"; /// Returns the on-disk path for a profile search index. pub fn search_index_path(root: &Path, profile_name: &str) -> PathBuf { root.join(profile_name) } /// Returns the unique index key for one table row inside a profile index. pub fn search_row_key(table_name: &str, row_id: i64) -> String { format!("{}:{}", table_name, row_id) } /// Normalizes user-entered values for exact-mode terms. pub fn normalize_exact(input: &str) -> String { let trimmed = input.trim(); if trimmed.is_empty() { return String::new(); } let mut analyzer = exact_analyzer(); let mut stream = analyzer.token_stream(trimmed); let mut out = String::with_capacity(trimmed.len()); while let Some(token) = stream.next() { out.push_str(&token.text); } out } /// Normalizes a column name to the JSON-key form used at index time. pub fn normalize_column_name(column: &str) -> String { column.to_ascii_lowercase() } /// Creates the column-aware search schema. pub fn create_search_schema() -> Schema { let mut schema_builder = Schema::builder(); schema_builder.add_u64_field(F_PG_ID, INDEXED | STORED); schema_builder.add_text_field(F_TABLE_NAME, STRING | STORED); schema_builder.add_text_field(F_ROW_KEY, STRING | STORED); schema_builder.add_text_field(F_ALL_TEXT, text_options(TOK_WORD)); schema_builder.add_json_field(F_DATA_WORD, json_options(TOK_WORD, true, false)); schema_builder.add_json_field(F_DATA_NGRAM, json_options(TOK_NGRAM, true, false)); schema_builder.add_json_field(F_DATA_EXACT, json_options(TOK_EXACT, false, false)); schema_builder.build() } fn text_options(tokenizer_name: &str) -> TextOptions { let indexing = TextFieldIndexing::default() .set_tokenizer(tokenizer_name) .set_index_option(IndexRecordOption::WithFreqsAndPositions); TextOptions::default().set_indexing_options(indexing) } fn json_options(tokenizer_name: &str, with_positions: bool, stored: bool) -> JsonObjectOptions { let index_option = if with_positions { IndexRecordOption::WithFreqsAndPositions } else { IndexRecordOption::Basic }; let indexing = TextFieldIndexing::default() .set_tokenizer(tokenizer_name) .set_index_option(index_option); let mut options = JsonObjectOptions::default().set_indexing_options(indexing); if stored { options = options.set_stored(); } options } /// Registers all required tokenizers with the index. pub fn register_tokenizers(index: &Index) -> tantivy::Result<()> { let tokenizer_manager = index.tokenizers(); tokenizer_manager.register(TOK_WORD, word_analyzer()); tokenizer_manager.register(TOK_NGRAM, ngram_analyzer()?); tokenizer_manager.register(TOK_EXACT, exact_analyzer()); Ok(()) } fn word_analyzer() -> TextAnalyzer { TextAnalyzer::builder(SimpleTokenizer::default()) .filter(RemoveLongFilter::limit(80)) .filter(LowerCaser) .filter(AsciiFoldingFilter) .build() } fn ngram_analyzer() -> tantivy::Result { Ok(TextAnalyzer::builder(NgramTokenizer::new(3, 3, false)?) .filter(RemoveLongFilter::limit(80)) .filter(LowerCaser) .filter(AsciiFoldingFilter) .build()) } fn exact_analyzer() -> TextAnalyzer { TextAnalyzer::builder(RawTokenizer::default()) .filter(LowerCaser) .filter(AsciiFoldingFilter) .build() } /// Tokenizes text the same way `data_word` is indexed. pub fn tokenize_word(text: &str) -> Vec { tokenize_with(word_analyzer(), text) } /// Tokenizes text the same way `data_ngram` is indexed. pub fn tokenize_ngram(text: &str) -> Vec { match ngram_analyzer() { Ok(analyzer) => tokenize_with(analyzer, text), Err(_) => Vec::new(), } } fn tokenize_with(mut analyzer: TextAnalyzer, text: &str) -> Vec { let mut stream = analyzer.token_stream(text); let mut out = Vec::new(); while let Some(token) = stream.next() { out.push(token.text.clone()); } out } /// Builds a term scoped to a specific JSON path within a JSON field. pub fn json_path_term(field: Field, column: &str, text: &str) -> Term { let mut term = Term::from_field_json_path(field, column, false); term.append_type_and_str(text); term } /// Returns all required schema fields or fails loudly on mismatch. pub struct SchemaFields { pub pg_id: Field, pub table_name: Field, pub row_key: Field, pub all_text: Field, pub data_word: Field, pub data_ngram: Field, pub data_exact: Field, } impl SchemaFields { pub fn from(schema: &Schema) -> tantivy::Result { Ok(Self { pg_id: get_field(schema, F_PG_ID)?, table_name: get_field(schema, F_TABLE_NAME)?, row_key: get_field(schema, F_ROW_KEY)?, all_text: get_field(schema, F_ALL_TEXT)?, data_word: get_field(schema, F_DATA_WORD)?, data_ngram: get_field(schema, F_DATA_NGRAM)?, data_exact: get_field(schema, F_DATA_EXACT)?, }) } } fn get_field(schema: &Schema, name: &str) -> tantivy::Result { schema.get_field(name).map_err(|e| { tantivy::TantivyError::SchemaError(format!("schema is missing field '{name}': {e}")) }) }