// server/src/search_schema.rs

use tantivy::schema::*;
use tantivy::tokenizer::*;
use tantivy::Index;
use std::path::Path;

/// Creates a Tantivy schema optimized for Slovak ngram search.
pub fn create_search_schema() -> Schema {
    let mut schema_builder = Schema::builder();

    // ID field to link each search hit back to its PostgreSQL row.
    schema_builder.add_u64_field("pg_id", INDEXED | STORED);

    // Slovak text field with an ngram tokenizer for search-as-you-type.
    // The tokenizer name must match the name registered in
    // register_slovak_tokenizer() below.
    let text_field_indexing = TextFieldIndexing::default()
        .set_tokenizer("slovak")
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);

    let text_options = TextOptions::default()
        .set_indexing_options(text_field_indexing)
        .set_stored();

    schema_builder.add_text_field("text_sk", text_options);

    schema_builder.build()
}

/// Registers the Slovak ngram tokenizer with the index.
///
/// The name "slovak" must match the tokenizer name referenced by the
/// "text_sk" field in create_search_schema().
pub fn register_slovak_tokenizer(index: &Index) -> tantivy::Result<()> {
    let tokenizer_manager = index.tokenizers();

    // Ngram pipeline: fixed-length character trigrams (min = max = 3,
    // prefix_only = false), lowercased and ASCII-folded so that diacritics
    // (č -> c, š -> s, ž -> z, ...) do not affect matching.
    let slovak_ngram_tokenizer = TextAnalyzer::builder(NgramTokenizer::new(3, 3, false)?)
        .filter(RemoveLongFilter::limit(40)) // drop pathologically long tokens
        .filter(LowerCaser)
        .filter(AsciiFoldingFilter)
        .build();

    tokenizer_manager.register("slovak", slovak_ngram_tokenizer);

    Ok(())
}
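
// A minimal sketch of what the "slovak" pipeline above produces, assuming
// tantivy 0.20+ tokenizer APIs: the text is split into character trigrams,
// then lowercased and ASCII-folded, so "Čučoriedka" yields diacritic-free
// terms such as "cuc", "uco", "cor", "ori", ...
#[cfg(test)]
mod slovak_tokenizer_sketch {
    use super::{create_search_schema, register_slovak_tokenizer};
    use tantivy::Index;

    #[test]
    fn folds_diacritics_into_trigrams() -> tantivy::Result<()> {
        let index = Index::create_in_ram(create_search_schema());
        register_slovak_tokenizer(&index)?;

        let mut analyzer = index
            .tokenizers()
            .get("slovak")
            .expect("tokenizer registered under the name \"slovak\"");

        let mut terms = Vec::new();
        let mut stream = analyzer.token_stream("Čučoriedka");
        while let Some(token) = stream.next() {
            terms.push(token.text.clone());
        }

        assert!(terms.contains(&"cuc".to_string()));
        assert!(terms.contains(&"uco".to_string()));
        Ok(())
    }
}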

/// Gets or creates an index for a table with proper Slovak ngram processing.
pub fn get_or_create_index(table_name: &str) -> tantivy::Result<Index> {
    let index_path = Path::new("./tantivy_indexes").join(table_name);
    std::fs::create_dir_all(&index_path)?;

    let index = if index_path.join("meta.json").exists() {
        Index::open_in_dir(&index_path)?
    } else {
        let schema = create_search_schema();
        Index::create_in_dir(&index_path, schema)?
    };

    // Tokenizers are not persisted with the index, so the "slovak" tokenizer
    // must be re-registered every time the index is opened.
    register_slovak_tokenizer(&index)?;

    Ok(index)
}
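
// A minimal end-to-end sketch (tantivy 0.20+ APIs) of how an index built from
// this schema can be written to and queried. It uses an in-RAM index so the
// on-disk ./tantivy_indexes directory stays untouched; the flow is the same
// for an index returned by get_or_create_index(). The 15 MB writer budget and
// the sample row are illustrative only.
#[cfg(test)]
mod index_roundtrip_sketch {
    use super::{create_search_schema, register_slovak_tokenizer};
    use tantivy::collector::TopDocs;
    use tantivy::query::TermQuery;
    use tantivy::schema::{IndexRecordOption, Term};
    use tantivy::{doc, Index};

    #[test]
    fn finds_row_by_folded_trigram() -> tantivy::Result<()> {
        let index = Index::create_in_ram(create_search_schema());
        register_slovak_tokenizer(&index)?;

        let schema = index.schema();
        let pg_id = schema.get_field("pg_id")?;
        let text_sk = schema.get_field("text_sk")?;

        // Index a single PostgreSQL row.
        let mut writer = index.writer(15_000_000)?;
        writer.add_document(doc!(pg_id => 42u64, text_sk => "čučoriedka"))?;
        writer.commit()?;

        // The indexed terms are folded trigrams, so a diacritic-free fragment
        // of the word ("cuc") is enough to find the row.
        let reader = index.reader()?;
        let searcher = reader.searcher();
        let query = TermQuery::new(
            Term::from_field_text(text_sk, "cuc"),
            IndexRecordOption::Basic,
        );
        let hits = searcher.search(&query, &TopDocs::with_limit(10))?;
        assert_eq!(hits.len(), 1);
        Ok(())
    }
}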