search in common module, now fixing layer mixing issue

filipriec
2025-06-10 13:47:18 +02:00
parent 350c522d19
commit 679bb3b6ab
6 changed files with 96 additions and 119 deletions

server/src/search_schema.rs

@@ -1,97 +1,26 @@
 // server/src/search_schema.rs
-use tantivy::schema::*;
-use tantivy::tokenizer::*;
-use tantivy::Index;
 use std::path::Path;
+use tantivy::Index;
 
-/// Creates a hybrid Slovak search schema with optimized prefix fields.
-pub fn create_search_schema() -> Schema {
-    let mut schema_builder = Schema::builder();
-
-    schema_builder.add_u64_field("pg_id", INDEXED | STORED);
-
-    // FIELD 1: For prefixes (1-15 chars).
-    let short_prefix_indexing = TextFieldIndexing::default()
-        .set_tokenizer("slovak_prefix_edge")
-        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
-    let short_prefix_options = TextOptions::default()
-        .set_indexing_options(short_prefix_indexing)
-        .set_stored();
-    schema_builder.add_text_field("prefix_edge", short_prefix_options);
-
-    // FIELD 2: For the full word.
-    let full_word_indexing = TextFieldIndexing::default()
-        .set_tokenizer("slovak_prefix_full")
-        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
-    let full_word_options = TextOptions::default()
-        .set_indexing_options(full_word_indexing)
-        .set_stored();
-    schema_builder.add_text_field("prefix_full", full_word_options);
-
-    // NGRAM FIELD: For substring matching.
-    let ngram_field_indexing = TextFieldIndexing::default()
-        .set_tokenizer("slovak_ngram")
-        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
-    let ngram_options = TextOptions::default()
-        .set_indexing_options(ngram_field_indexing)
-        .set_stored();
-    schema_builder.add_text_field("text_ngram", ngram_options);
-
-    schema_builder.build()
-}
-
-/// Registers all necessary Slovak tokenizers with the index.
-pub fn register_slovak_tokenizer(index: &Index) -> tantivy::Result<()> {
-    let tokenizer_manager = index.tokenizers();
-
-    // TOKENIZER for `prefix_edge`: Edge N-gram (1-15 chars).
-    if tokenizer_manager.get("slovak_prefix_edge").is_none() {
-        // Extend max_gram to a practical limit for prefix matching.
-        let tokenizer = TextAnalyzer::builder(
-            NgramTokenizer::new(1, 15, true)?
-        )
-        .filter(RemoveLongFilter::limit(40))
-        .filter(LowerCaser)
-        .filter(AsciiFoldingFilter)
-        .build();
-        tokenizer_manager.register("slovak_prefix_edge", tokenizer);
-    }
-
-    // TOKENIZER for `prefix_full`: Simple word tokenizer.
-    if tokenizer_manager.get("slovak_prefix_full").is_none() {
-        let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
-            .filter(RemoveLongFilter::limit(40))
-            .filter(LowerCaser)
-            .filter(AsciiFoldingFilter)
-            .build();
-        tokenizer_manager.register("slovak_prefix_full", tokenizer);
-    }
-
-    // NGRAM TOKENIZER: For substring matching.
-    if tokenizer_manager.get("slovak_ngram").is_none() {
-        let tokenizer = TextAnalyzer::builder(
-            NgramTokenizer::new(3, 3, false)?
-        )
-        .filter(RemoveLongFilter::limit(40))
-        .filter(LowerCaser)
-        .filter(AsciiFoldingFilter)
-        .build();
-        tokenizer_manager.register("slovak_ngram", tokenizer);
-    }
-
-    Ok(())
-}
+// Re-export the functions from the common crate.
+// This makes them available as `crate::search_schema::create_search_schema`, etc.
+pub use common::search::{create_search_schema, register_slovak_tokenizers};
 
 /// Gets an existing index or creates a new one.
+/// This function now uses the shared logic from the `common` crate.
 pub fn get_or_create_index(table_name: &str) -> tantivy::Result<Index> {
     let index_path = Path::new("./tantivy_indexes").join(table_name);
     std::fs::create_dir_all(&index_path)?;
 
     let index = if index_path.join("meta.json").exists() {
         Index::open_in_dir(&index_path)?
     } else {
         let schema = create_search_schema();
         Index::create_in_dir(&index_path, schema)?
     };
 
-    register_slovak_tokenizer(&index)?;
+    // This now calls the single, authoritative function from `common`.
+    register_slovak_tokenizers(&index)?;
 
     Ok(index)
 }
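
For context, a minimal sketch of how the re-exported helpers might be called from server code after this change. It assumes the tantivy 0.20+ API already used above (where Schema::get_field and NgramTokenizer::new return Result), the field names from the shared schema (pg_id, prefix_edge, prefix_full, text_ngram), and a hypothetical demo_search caller with "products" as an example table name; it is an illustration, not part of the commit.

use tantivy::collector::TopDocs;
use tantivy::doc;
use tantivy::query::TermQuery;
use tantivy::schema::IndexRecordOption;
use tantivy::Term;

fn demo_search() -> tantivy::Result<()> {
    // `get_or_create_index` is the function kept by this commit.
    let index = crate::search_schema::get_or_create_index("products")?;
    let schema = index.schema();
    let pg_id = schema.get_field("pg_id")?;
    let prefix_edge = schema.get_field("prefix_edge")?;
    let prefix_full = schema.get_field("prefix_full")?;
    let text_ngram = schema.get_field("text_ngram")?;

    // The same source text is written to all three fields; each tokenizer
    // cuts it differently: "mlieko" becomes m/ml/mli/... in prefix_edge,
    // the whole word in prefix_full, and mli/lie/iek/eko in text_ngram.
    let mut writer = index.writer(50_000_000)?;
    writer.add_document(doc!(
        pg_id => 1u64,
        prefix_edge => "mlieko",
        prefix_full => "mlieko",
        text_ngram => "mlieko"
    ))?;
    writer.commit()?;

    // Prefix lookup: "mli" is one of the edge-n-gram terms indexed above.
    let searcher = index.reader()?.searcher();
    let term = Term::from_field_text(prefix_edge, "mli");
    let query = TermQuery::new(term, IndexRecordOption::Basic);
    let hits = searcher.search(&query, &TopDocs::with_limit(10))?;
    println!("{} hit(s) for prefix query", hits.len());
    Ok(())
}

Note the term-level lookup on prefix_edge: feeding the query string back through QueryParser on that field would re-tokenize "mli" with the edge-n-gram analyzer into m/ml/mli rather than matching the single indexed term, so the sketch queries the term directly.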