Files
komp_ac/server/src/search_schema.rs
2025-06-09 16:36:18 +02:00

64 lines
2.1 KiB
Rust

// server/src/search_schema.rs
use tantivy::schema::*;
use tantivy::tokenizer::*;
use tantivy::Index;
use std::path::Path;
/// Builds the Tantivy schema used for Slovak ngram search.
///
/// The schema has two fields, added in this order:
/// - `pg_id`: an indexed and stored `u64` that links a document back to its
///   PostgreSQL row.
/// - `text_sk`: a stored text field analysed with the tokenizer registered
///   under the name `"slovak"`, indexed with frequencies and positions.
pub fn create_search_schema() -> Schema {
    let mut builder = Schema::builder();

    // Identifier used to map a search hit back to PostgreSQL.
    builder.add_u64_field("pg_id", INDEXED | STORED);

    // The tokenizer name must stay in sync with the name the tokenizer is
    // registered under on the index ("slovak").
    let indexing = TextFieldIndexing::default()
        .set_index_option(IndexRecordOption::WithFreqsAndPositions)
        .set_tokenizer("slovak");
    let slovak_text = TextOptions::default()
        .set_stored()
        .set_indexing_options(indexing);
    builder.add_text_field("text_sk", slovak_text);

    builder.build()
}
/// Registers the Slovak ngram analyzer on `index` under the name `"slovak"`.
///
/// The analyzer splits text into ngrams (min 3, max 3, not prefix-only) and
/// then applies, in order: a long-token filter with a limit of 40,
/// lowercasing, and ASCII folding (so diacritics such as č, š, ž are folded
/// to c, s, z).
///
/// # Errors
///
/// Propagates the error from `NgramTokenizer::new` if the ngram parameters
/// are rejected.
pub fn register_slovak_tokenizer(index: &Index) -> tantivy::Result<()> {
    // min = 3, max = 3, prefix_only = false
    let trigrams = NgramTokenizer::new(3, 3, false)?;

    let analyzer = TextAnalyzer::builder(trigrams)
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser)
        .filter(AsciiFoldingFilter)
        .build();

    // The name has to match the tokenizer name referenced by the schema.
    index.tokenizers().register("slovak", analyzer);
    Ok(())
}
/// Opens the Tantivy index for `table_name`, creating it first if needed.
///
/// Indexes live under `./tantivy_indexes/<table_name>`. The directory is
/// created on demand. If it already contains a `meta.json` the existing index
/// is opened; otherwise a new index is created from `create_search_schema()`.
/// The `"slovak"` tokenizer is registered on the returned index in both cases.
///
/// # Errors
///
/// Returns `TantivyError::InvalidArgument` when `table_name` does not name a
/// location strictly inside the index root: an empty name, a name made only
/// of `.`, an absolute path, or a path containing `..`. Any I/O or Tantivy
/// error raised while creating the directory, opening/creating the index, or
/// registering the tokenizer is propagated.
pub fn get_or_create_index(table_name: &str) -> tantivy::Result<Index> {
    // `Path::join` replaces the base when given an absolute path, and `..`
    // components walk out of it, so validate the name before touching disk.
    let invalid_name = || {
        tantivy::TantivyError::InvalidArgument(format!(
            "invalid index table name: {:?}",
            table_name
        ))
    };
    let mut names_directory = false;
    for component in Path::new(table_name).components() {
        match component {
            std::path::Component::Normal(_) => names_directory = true,
            std::path::Component::CurDir => {}
            // RootDir, Prefix and ParentDir would all leave the index root.
            _ => return Err(invalid_name()),
        }
    }
    // An empty or "."-only name would put the index in the shared root itself.
    if !names_directory {
        return Err(invalid_name());
    }

    let index_path = Path::new("./tantivy_indexes").join(table_name);
    std::fs::create_dir_all(&index_path)?;

    // `meta.json` marks a directory that already holds an index.
    let index = if index_path.join("meta.json").exists() {
        Index::open_in_dir(&index_path)?
    } else {
        Index::create_in_dir(&index_path, create_search_schema())?
    };

    // Register the tokenizer on every call, for opened and new indexes alike.
    register_slovak_tokenizer(&index)?;
    Ok(index)
}