// common/src/search.rs use tantivy::schema::*; use tantivy::tokenizer::*; use tantivy::Index; /// Creates a hybrid Slovak search schema with optimized prefix fields. pub fn create_search_schema() -> Schema { let mut schema_builder = Schema::builder(); schema_builder.add_u64_field("pg_id", INDEXED | STORED); // FIELD 1: For prefixes (1-4 chars). let short_prefix_indexing = TextFieldIndexing::default() .set_tokenizer("slovak_prefix_edge") .set_index_option(IndexRecordOption::WithFreqsAndPositions); let short_prefix_options = TextOptions::default() .set_indexing_options(short_prefix_indexing) .set_stored(); schema_builder.add_text_field("prefix_edge", short_prefix_options); // FIELD 2: For the full word. let full_word_indexing = TextFieldIndexing::default() .set_tokenizer("slovak_prefix_full") .set_index_option(IndexRecordOption::WithFreqsAndPositions); let full_word_options = TextOptions::default() .set_indexing_options(full_word_indexing) .set_stored(); schema_builder.add_text_field("prefix_full", full_word_options); // NGRAM FIELD: For substring matching. let ngram_field_indexing = TextFieldIndexing::default() .set_tokenizer("slovak_ngram") .set_index_option(IndexRecordOption::WithFreqsAndPositions); let ngram_options = TextOptions::default() .set_indexing_options(ngram_field_indexing) .set_stored(); schema_builder.add_text_field("text_ngram", ngram_options); schema_builder.build() } /// Registers all necessary Slovak tokenizers with the index. /// /// This must be called by ANY process that opens the index /// to ensure the tokenizers are loaded into memory. pub fn register_slovak_tokenizers(index: &Index) -> tantivy::Result<()> { let tokenizer_manager = index.tokenizers(); // TOKENIZER for `prefix_edge`: Edge N-gram (1-4 chars) let edge_tokenizer = TextAnalyzer::builder(NgramTokenizer::new(1, 4, true)?) .filter(RemoveLongFilter::limit(40)) .filter(LowerCaser) .filter(AsciiFoldingFilter) .build(); tokenizer_manager.register("slovak_prefix_edge", edge_tokenizer); // TOKENIZER for `prefix_full`: Simple word tokenizer let full_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()) .filter(RemoveLongFilter::limit(40)) .filter(LowerCaser) .filter(AsciiFoldingFilter) .build(); tokenizer_manager.register("slovak_prefix_full", full_tokenizer); // NGRAM TOKENIZER: For substring matching. let ngram_tokenizer = TextAnalyzer::builder(NgramTokenizer::new(3, 3, false)?) .filter(RemoveLongFilter::limit(40)) .filter(LowerCaser) .filter(AsciiFoldingFilter) .build(); tokenizer_manager.register("slovak_ngram", ngram_tokenizer); Ok(()) }