diff --git a/Cargo.lock b/Cargo.lock index a19c193..9ffe175 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -488,6 +488,7 @@ version = "0.3.13" dependencies = [ "prost", "serde", + "tantivy", "tonic", "tonic-build", ] diff --git a/common/Cargo.toml b/common/Cargo.toml index 7ff5c21..e5f8b7b 100644 --- a/common/Cargo.toml +++ b/common/Cargo.toml @@ -9,5 +9,8 @@ tonic = "0.13.0" prost = "0.13.5" serde = { version = "1.0.219", features = ["derive"] } +# Search +tantivy = { workspace = true } + [build-dependencies] tonic-build = "0.13.0" diff --git a/common/src/lib.rs b/common/src/lib.rs index 6ff57ee..c35e8a1 100644 --- a/common/src/lib.rs +++ b/common/src/lib.rs @@ -1,4 +1,7 @@ // common/src/lib.rs + +pub mod search; + pub mod proto { pub mod multieko2 { pub mod adresar { diff --git a/common/src/search.rs b/common/src/search.rs new file mode 100644 index 0000000..35d98ae --- /dev/null +++ b/common/src/search.rs @@ -0,0 +1,78 @@ +// common/src/search.rs + +use tantivy::schema::*; +use tantivy::tokenizer::*; +use tantivy::Index; + +/// Creates a hybrid Slovak search schema with optimized prefix fields. +pub fn create_search_schema() -> Schema { + let mut schema_builder = Schema::builder(); + + schema_builder.add_u64_field("pg_id", INDEXED | STORED); + + // FIELD 1: For prefixes (1-15 chars). + let short_prefix_indexing = TextFieldIndexing::default() + .set_tokenizer("slovak_prefix_edge") + .set_index_option(IndexRecordOption::WithFreqsAndPositions); + let short_prefix_options = TextOptions::default() + .set_indexing_options(short_prefix_indexing) + .set_stored(); + schema_builder.add_text_field("prefix_edge", short_prefix_options); + + // FIELD 2: For the full word. 
+ let full_word_indexing = TextFieldIndexing::default() + .set_tokenizer("slovak_prefix_full") + .set_index_option(IndexRecordOption::WithFreqsAndPositions); + let full_word_options = TextOptions::default() + .set_indexing_options(full_word_indexing) + .set_stored(); + schema_builder.add_text_field("prefix_full", full_word_options); + + // NGRAM FIELD: For substring matching. + let ngram_field_indexing = TextFieldIndexing::default() + .set_tokenizer("slovak_ngram") + .set_index_option(IndexRecordOption::WithFreqsAndPositions); + let ngram_options = TextOptions::default() + .set_indexing_options(ngram_field_indexing) + .set_stored(); + schema_builder.add_text_field("text_ngram", ngram_options); + + schema_builder.build() +} + +/// Registers all necessary Slovak tokenizers with the index. +/// +/// This must be called by ANY process that opens the index +/// to ensure the tokenizers are loaded into memory. +pub fn register_slovak_tokenizers(index: &Index) -> tantivy::Result<()> { + let tokenizer_manager = index.tokenizers(); + + // TOKENIZER for `prefix_edge`: Edge N-gram (1-15 chars) + let edge_tokenizer = + TextAnalyzer::builder(NgramTokenizer::new(1, 15, true)?) + .filter(RemoveLongFilter::limit(40)) + .filter(LowerCaser) + .filter(AsciiFoldingFilter) + .build(); + tokenizer_manager.register("slovak_prefix_edge", edge_tokenizer); + + // TOKENIZER for `prefix_full`: Simple word tokenizer + let full_tokenizer = + TextAnalyzer::builder(SimpleTokenizer::default()) + .filter(RemoveLongFilter::limit(40)) + .filter(LowerCaser) + .filter(AsciiFoldingFilter) + .build(); + tokenizer_manager.register("slovak_prefix_full", full_tokenizer); + + // NGRAM TOKENIZER: For substring matching. + let ngram_tokenizer = + TextAnalyzer::builder(NgramTokenizer::new(3, 3, false)?) 
+ .filter(RemoveLongFilter::limit(40)) + .filter(LowerCaser) + .filter(AsciiFoldingFilter) + .build(); + tokenizer_manager.register("slovak_ngram", ngram_tokenizer); + + Ok(()) +} diff --git a/search/src/lib.rs b/search/src/lib.rs index 9df77ba..2d7049f 100644 --- a/search/src/lib.rs +++ b/search/src/lib.rs @@ -15,6 +15,7 @@ use common::proto::multieko2::search::{ }; pub use common::proto::multieko2::search::searcher_server::SearcherServer; use common::proto::multieko2::search::searcher_server::Searcher; +use common::search::register_slovak_tokenizers; use tantivy::schema::Value; pub struct SearcherService; @@ -217,41 +218,3 @@ impl Searcher for SearcherService { } } -/// This function is now an exact mirror of the one in `server/src/search_schema.rs` -fn register_slovak_tokenizers(index: &Index) -> tantivy::Result<()> { - use tantivy::tokenizer::*; - - let tokenizer_manager = index.tokenizers(); - - // TOKENIZER for `prefix_edge`: Edge N-gram (1-15 chars) - if tokenizer_manager.get("slovak_prefix_edge").is_none() { - let tokenizer = TextAnalyzer::builder(NgramTokenizer::new(1, 15, true)?) - .filter(RemoveLongFilter::limit(40)) - .filter(LowerCaser) - .filter(AsciiFoldingFilter) - .build(); - tokenizer_manager.register("slovak_prefix_edge", tokenizer); - } - - // TOKENIZER for `prefix_full`: Simple word tokenizer - if tokenizer_manager.get("slovak_prefix_full").is_none() { - let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()) - .filter(RemoveLongFilter::limit(40)) - .filter(LowerCaser) - .filter(AsciiFoldingFilter) - .build(); - tokenizer_manager.register("slovak_prefix_full", tokenizer); - } - - // NGRAM TOKENIZER: For substring matching. - if tokenizer_manager.get("slovak_ngram").is_none() { - let tokenizer = TextAnalyzer::builder(NgramTokenizer::new(3, 3, false)?) 
- .filter(RemoveLongFilter::limit(40)) - .filter(LowerCaser) - .filter(AsciiFoldingFilter) - .build(); - tokenizer_manager.register("slovak_ngram", tokenizer); - } - - Ok(()) -} diff --git a/server/src/search_schema.rs b/server/src/search_schema.rs index d158619..6cbc12f 100644 --- a/server/src/search_schema.rs +++ b/server/src/search_schema.rs @@ -1,97 +1,26 @@ // server/src/search_schema.rs -use tantivy::schema::*; -use tantivy::tokenizer::*; -use tantivy::Index; use std::path::Path; +use tantivy::Index; -/// Creates a hybrid Slovak search schema with optimized prefix fields. -pub fn create_search_schema() -> Schema { - let mut schema_builder = Schema::builder(); - - schema_builder.add_u64_field("pg_id", INDEXED | STORED); - - // FIELD 1: For prefixes (1-15 chars). - let short_prefix_indexing = TextFieldIndexing::default() - .set_tokenizer("slovak_prefix_edge") - .set_index_option(IndexRecordOption::WithFreqsAndPositions); - let short_prefix_options = TextOptions::default() - .set_indexing_options(short_prefix_indexing) - .set_stored(); - schema_builder.add_text_field("prefix_edge", short_prefix_options); - - // FIELD 2: For the full word. - let full_word_indexing = TextFieldIndexing::default() - .set_tokenizer("slovak_prefix_full") - .set_index_option(IndexRecordOption::WithFreqsAndPositions); - let full_word_options = TextOptions::default() - .set_indexing_options(full_word_indexing) - .set_stored(); - schema_builder.add_text_field("prefix_full", full_word_options); - - // NGRAM FIELD: For substring matching. - let ngram_field_indexing = TextFieldIndexing::default() - .set_tokenizer("slovak_ngram") - .set_index_option(IndexRecordOption::WithFreqsAndPositions); - let ngram_options = TextOptions::default() - .set_indexing_options(ngram_field_indexing) - .set_stored(); - schema_builder.add_text_field("text_ngram", ngram_options); - - schema_builder.build() -} - -/// Registers all necessary Slovak tokenizers with the index. 
-pub fn register_slovak_tokenizer(index: &Index) -> tantivy::Result<()> { - let tokenizer_manager = index.tokenizers(); - - // TOKENIZER for `prefix_edge`: Edge N-gram (1-15 chars) - if tokenizer_manager.get("slovak_prefix_edge").is_none() { - // YOUR RECOMMENDED FIX: Extend the max_gram to a more practical limit. - let tokenizer = TextAnalyzer::builder( - NgramTokenizer::new(1, 15, true)? - ) - .filter(RemoveLongFilter::limit(40)) - .filter(LowerCaser) - .filter(AsciiFoldingFilter) - .build(); - tokenizer_manager.register("slovak_prefix_edge", tokenizer); - } - - // TOKENIZER for `prefix_full`: Simple word tokenizer - if tokenizer_manager.get("slovak_prefix_full").is_none() { - let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()) - .filter(RemoveLongFilter::limit(40)) - .filter(LowerCaser) - .filter(AsciiFoldingFilter) - .build(); - tokenizer_manager.register("slovak_prefix_full", tokenizer); - } - - // NGRAM TOKENIZER: For substring matching. - if tokenizer_manager.get("slovak_ngram").is_none() { - let tokenizer = TextAnalyzer::builder( - NgramTokenizer::new(3, 3, false)? - ) - .filter(RemoveLongFilter::limit(40)) - .filter(LowerCaser) - .filter(AsciiFoldingFilter) - .build(); - tokenizer_manager.register("slovak_ngram", tokenizer); - } - - Ok(()) -} +// Re-export the functions from the common crate. +// This makes them available as `crate::search_schema::create_search_schema`, etc. +pub use common::search::{create_search_schema, register_slovak_tokenizers}; +/// Gets an existing index or creates a new one. +/// This function now uses the shared logic from the `common` crate. pub fn get_or_create_index(table_name: &str) -> tantivy::Result<Index> { let index_path = Path::new("./tantivy_indexes").join(table_name); std::fs::create_dir_all(&index_path)?; + let index = if index_path.join("meta.json").exists() { Index::open_in_dir(&index_path)? } else { let schema = create_search_schema(); Index::create_in_dir(&index_path, schema)? 
}; - register_slovak_tokenizer(&index)?; + + // This now calls the single, authoritative function from `common`. + register_slovak_tokenizers(&index)?; Ok(index) }