search in common module, now fixing layer mixing issue
This commit is contained in:
@@ -1,4 +1,7 @@
|
||||
// common/src/lib.rs
|
||||
|
||||
pub mod search;
|
||||
|
||||
pub mod proto {
|
||||
pub mod multieko2 {
|
||||
pub mod adresar {
|
||||
|
||||
78
common/src/search.rs
Normal file
78
common/src/search.rs
Normal file
@@ -0,0 +1,78 @@
|
||||
// common/src/search.rs
|
||||
|
||||
use tantivy::schema::*;
|
||||
use tantivy::tokenizer::*;
|
||||
use tantivy::Index;
|
||||
|
||||
/// Creates a hybrid Slovak search schema with optimized prefix fields.
|
||||
pub fn create_search_schema() -> Schema {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
schema_builder.add_u64_field("pg_id", INDEXED | STORED);
|
||||
|
||||
// FIELD 1: For prefixes (1-15 chars).
|
||||
let short_prefix_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("slovak_prefix_edge")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
let short_prefix_options = TextOptions::default()
|
||||
.set_indexing_options(short_prefix_indexing)
|
||||
.set_stored();
|
||||
schema_builder.add_text_field("prefix_edge", short_prefix_options);
|
||||
|
||||
// FIELD 2: For the full word.
|
||||
let full_word_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("slovak_prefix_full")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
let full_word_options = TextOptions::default()
|
||||
.set_indexing_options(full_word_indexing)
|
||||
.set_stored();
|
||||
schema_builder.add_text_field("prefix_full", full_word_options);
|
||||
|
||||
// NGRAM FIELD: For substring matching.
|
||||
let ngram_field_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("slovak_ngram")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
let ngram_options = TextOptions::default()
|
||||
.set_indexing_options(ngram_field_indexing)
|
||||
.set_stored();
|
||||
schema_builder.add_text_field("text_ngram", ngram_options);
|
||||
|
||||
schema_builder.build()
|
||||
}
|
||||
|
||||
/// Registers all necessary Slovak tokenizers with the index.
|
||||
///
|
||||
/// This must be called by ANY process that opens the index
|
||||
/// to ensure the tokenizers are loaded into memory.
|
||||
pub fn register_slovak_tokenizers(index: &Index) -> tantivy::Result<()> {
|
||||
let tokenizer_manager = index.tokenizers();
|
||||
|
||||
// TOKENIZER for `prefix_edge`: Edge N-gram (1-15 chars)
|
||||
let edge_tokenizer =
|
||||
TextAnalyzer::builder(NgramTokenizer::new(1, 15, true)?)
|
||||
.filter(RemoveLongFilter::limit(40))
|
||||
.filter(LowerCaser)
|
||||
.filter(AsciiFoldingFilter)
|
||||
.build();
|
||||
tokenizer_manager.register("slovak_prefix_edge", edge_tokenizer);
|
||||
|
||||
// TOKENIZER for `prefix_full`: Simple word tokenizer
|
||||
let full_tokenizer =
|
||||
TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
.filter(RemoveLongFilter::limit(40))
|
||||
.filter(LowerCaser)
|
||||
.filter(AsciiFoldingFilter)
|
||||
.build();
|
||||
tokenizer_manager.register("slovak_prefix_full", full_tokenizer);
|
||||
|
||||
// NGRAM TOKENIZER: For substring matching.
|
||||
let ngram_tokenizer =
|
||||
TextAnalyzer::builder(NgramTokenizer::new(3, 3, false)?)
|
||||
.filter(RemoveLongFilter::limit(40))
|
||||
.filter(LowerCaser)
|
||||
.filter(AsciiFoldingFilter)
|
||||
.build();
|
||||
tokenizer_manager.register("slovak_ngram", ngram_tokenizer);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
Reference in New Issue
Block a user