Move search into the common module; now fixing the layer-mixing issue
Cargo.lock (generated, +1 line)
@@ -488,6 +488,7 @@ version = "0.3.13"
 dependencies = [
  "prost",
  "serde",
+ "tantivy",
  "tonic",
  "tonic-build",
 ]
common/Cargo.toml (+3 lines)
@@ -9,5 +9,8 @@ tonic = "0.13.0"
 prost = "0.13.5"
 serde = { version = "1.0.219", features = ["derive"] }
+
+# Search
+tantivy = { workspace = true }
 
 [build-dependencies]
 tonic-build = "0.13.0"
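The `tantivy = { workspace = true }` line presupposes a `[workspace.dependencies]` entry in the workspace root manifest, which this commit does not touch. A minimal sketch of the assumed root entry (the version number is a guess, not taken from this diff):

# workspace-root Cargo.toml — assumed, not part of this commit
[workspace.dependencies]
tantivy = "0.22"  # version is an assumption; the diff only shows `workspace = true`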
common/src/lib.rs
@@ -1,4 +1,7 @@
 // common/src/lib.rs
+
+pub mod search;
+
 pub mod proto {
     pub mod multieko2 {
         pub mod adresar {
common/src/search.rs (new file, 78 lines)
@@ -0,0 +1,78 @@
+// common/src/search.rs
+
+use tantivy::schema::*;
+use tantivy::tokenizer::*;
+use tantivy::Index;
+
+/// Creates a hybrid Slovak search schema with optimized prefix fields.
+pub fn create_search_schema() -> Schema {
+    let mut schema_builder = Schema::builder();
+
+    schema_builder.add_u64_field("pg_id", INDEXED | STORED);
+
+    // FIELD 1: For prefixes (1-15 chars).
+    let short_prefix_indexing = TextFieldIndexing::default()
+        .set_tokenizer("slovak_prefix_edge")
+        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
+    let short_prefix_options = TextOptions::default()
+        .set_indexing_options(short_prefix_indexing)
+        .set_stored();
+    schema_builder.add_text_field("prefix_edge", short_prefix_options);
+
+    // FIELD 2: For the full word.
+    let full_word_indexing = TextFieldIndexing::default()
+        .set_tokenizer("slovak_prefix_full")
+        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
+    let full_word_options = TextOptions::default()
+        .set_indexing_options(full_word_indexing)
+        .set_stored();
+    schema_builder.add_text_field("prefix_full", full_word_options);
+
+    // NGRAM FIELD: For substring matching.
+    let ngram_field_indexing = TextFieldIndexing::default()
+        .set_tokenizer("slovak_ngram")
+        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
+    let ngram_options = TextOptions::default()
+        .set_indexing_options(ngram_field_indexing)
+        .set_stored();
+    schema_builder.add_text_field("text_ngram", ngram_options);
+
+    schema_builder.build()
+}
+
+/// Registers all necessary Slovak tokenizers with the index.
+///
+/// This must be called by ANY process that opens the index
+/// to ensure the tokenizers are loaded into memory.
+pub fn register_slovak_tokenizers(index: &Index) -> tantivy::Result<()> {
+    let tokenizer_manager = index.tokenizers();
+
+    // TOKENIZER for `prefix_edge`: Edge N-gram (1-15 chars)
+    let edge_tokenizer =
+        TextAnalyzer::builder(NgramTokenizer::new(1, 15, true)?)
+            .filter(RemoveLongFilter::limit(40))
+            .filter(LowerCaser)
+            .filter(AsciiFoldingFilter)
+            .build();
+    tokenizer_manager.register("slovak_prefix_edge", edge_tokenizer);
+
+    // TOKENIZER for `prefix_full`: Simple word tokenizer
+    let full_tokenizer =
+        TextAnalyzer::builder(SimpleTokenizer::default())
+            .filter(RemoveLongFilter::limit(40))
+            .filter(LowerCaser)
+            .filter(AsciiFoldingFilter)
+            .build();
+    tokenizer_manager.register("slovak_prefix_full", full_tokenizer);
+
+    // NGRAM TOKENIZER: For substring matching.
+    let ngram_tokenizer =
+        TextAnalyzer::builder(NgramTokenizer::new(3, 3, false)?)
+            .filter(RemoveLongFilter::limit(40))
+            .filter(LowerCaser)
+            .filter(AsciiFoldingFilter)
+            .build();
+    tokenizer_manager.register("slovak_ngram", ngram_tokenizer);
+
+    Ok(())
+}
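For context on how these two functions are meant to be used together, here is a minimal writer-side sketch. The in-RAM index, the sample value "Bratislava", and the 50 MB writer budget are illustrative assumptions, not code from this repository:

use common::search::{create_search_schema, register_slovak_tokenizers};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    // Build the shared schema and an index over it (in RAM for this sketch).
    let schema = create_search_schema();
    let index = Index::create_in_ram(schema.clone());
    // Tokenizers live in process memory, not in meta.json, so every
    // opener must register them before indexing or searching.
    register_slovak_tokenizers(&index)?;

    let pg_id = schema.get_field("pg_id")?;
    let prefix_edge = schema.get_field("prefix_edge")?;
    let prefix_full = schema.get_field("prefix_full")?;
    let text_ngram = schema.get_field("text_ngram")?;

    // The same source text feeds all three text fields; each field's
    // tokenizer slices it differently (edge n-grams, words, 3-grams).
    let mut writer = index.writer(50_000_000)?;
    writer.add_document(doc!(
        pg_id => 1u64,
        prefix_edge => "Bratislava",
        prefix_full => "Bratislava",
        text_ngram => "Bratislava",
    ))?;
    writer.commit()?;
    Ok(())
}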
Searcher service source (path not shown)
@@ -15,6 +15,7 @@ use common::proto::multieko2::search::{
 };
 pub use common::proto::multieko2::search::searcher_server::SearcherServer;
 use common::proto::multieko2::search::searcher_server::Searcher;
+use common::search::register_slovak_tokenizers;
 use tantivy::schema::Value;
 
 pub struct SearcherService;
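The newly imported function is what lets the searcher honor the rule that every process opening the index must register the tokenizers first. A sketch of that open-then-register pattern on the query side — the directory layout mirrors `get_or_create_index` below, and the field choice and query are assumptions:

use common::search::register_slovak_tokenizers;
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::Index;

fn search_table(table_name: &str, user_input: &str) -> tantivy::Result<()> {
    // Open the index written by the server process...
    let index = Index::open_in_dir(format!("./tantivy_indexes/{table_name}"))?;
    // ...and register the Slovak tokenizers before parsing any query;
    // without this, fields bound to "slovak_*" tokenizers are unusable.
    register_slovak_tokenizers(&index)?;

    let schema = index.schema();
    let reader = index.reader()?;
    let searcher = reader.searcher();

    // Query the edge-n-gram field so short prefixes match directly.
    let prefix_edge = schema.get_field("prefix_edge")?;
    let parser = QueryParser::for_index(&index, vec![prefix_edge]);
    let query = parser.parse_query(user_input)?;
    for (_score, addr) in searcher.search(&query, &TopDocs::with_limit(10))? {
        println!("hit: {addr:?}");
    }
    Ok(())
}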
@@ -217,41 +218,3 @@ impl Searcher for SearcherService {
     }
 }
 
-/// This function is now an exact mirror of the one in `server/src/search_schema.rs`
-fn register_slovak_tokenizers(index: &Index) -> tantivy::Result<()> {
-    use tantivy::tokenizer::*;
-
-    let tokenizer_manager = index.tokenizers();
-
-    // TOKENIZER for `prefix_edge`: Edge N-gram (1-15 chars)
-    if tokenizer_manager.get("slovak_prefix_edge").is_none() {
-        let tokenizer = TextAnalyzer::builder(NgramTokenizer::new(1, 15, true)?)
-            .filter(RemoveLongFilter::limit(40))
-            .filter(LowerCaser)
-            .filter(AsciiFoldingFilter)
-            .build();
-        tokenizer_manager.register("slovak_prefix_edge", tokenizer);
-    }
-
-    // TOKENIZER for `prefix_full`: Simple word tokenizer
-    if tokenizer_manager.get("slovak_prefix_full").is_none() {
-        let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
-            .filter(RemoveLongFilter::limit(40))
-            .filter(LowerCaser)
-            .filter(AsciiFoldingFilter)
-            .build();
-        tokenizer_manager.register("slovak_prefix_full", tokenizer);
-    }
-
-    // NGRAM TOKENIZER: For substring matching.
-    if tokenizer_manager.get("slovak_ngram").is_none() {
-        let tokenizer = TextAnalyzer::builder(NgramTokenizer::new(3, 3, false)?)
-            .filter(RemoveLongFilter::limit(40))
-            .filter(LowerCaser)
-            .filter(AsciiFoldingFilter)
-            .build();
-        tokenizer_manager.register("slovak_ngram", tokenizer);
-    }
-
-    Ok(())
-}
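A note on the dropped guards: the deleted copies wrapped each registration in a `tokenizer_manager.get(...).is_none()` check, while the shared `common` version registers unconditionally. To the best of my knowledge tantivy's `TokenizerManager::register` simply replaces any existing entry under the same name, so re-registering identical analyzers should be harmless; treat that as an assumption to verify if behavior differs across versions.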
server/src/search_schema.rs
@@ -1,97 +1,26 @@
 // server/src/search_schema.rs
 
-use tantivy::schema::*;
-use tantivy::tokenizer::*;
-use tantivy::Index;
 use std::path::Path;
+use tantivy::Index;
 
-/// Creates a hybrid Slovak search schema with optimized prefix fields.
-pub fn create_search_schema() -> Schema {
-    let mut schema_builder = Schema::builder();
-
-    schema_builder.add_u64_field("pg_id", INDEXED | STORED);
-
-    // FIELD 1: For prefixes (1-15 chars).
-    let short_prefix_indexing = TextFieldIndexing::default()
-        .set_tokenizer("slovak_prefix_edge")
-        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
-    let short_prefix_options = TextOptions::default()
-        .set_indexing_options(short_prefix_indexing)
-        .set_stored();
-    schema_builder.add_text_field("prefix_edge", short_prefix_options);
-
-    // FIELD 2: For the full word.
-    let full_word_indexing = TextFieldIndexing::default()
-        .set_tokenizer("slovak_prefix_full")
-        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
-    let full_word_options = TextOptions::default()
-        .set_indexing_options(full_word_indexing)
-        .set_stored();
-    schema_builder.add_text_field("prefix_full", full_word_options);
-
-    // NGRAM FIELD: For substring matching.
-    let ngram_field_indexing = TextFieldIndexing::default()
-        .set_tokenizer("slovak_ngram")
-        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
-    let ngram_options = TextOptions::default()
-        .set_indexing_options(ngram_field_indexing)
-        .set_stored();
-    schema_builder.add_text_field("text_ngram", ngram_options);
-
-    schema_builder.build()
-}
-
-/// Registers all necessary Slovak tokenizers with the index.
-pub fn register_slovak_tokenizer(index: &Index) -> tantivy::Result<()> {
-    let tokenizer_manager = index.tokenizers();
-
-    // TOKENIZER for `prefix_edge`: Edge N-gram (1-15 chars)
-    if tokenizer_manager.get("slovak_prefix_edge").is_none() {
-        // YOUR RECOMMENDED FIX: Extend the max_gram to a more practical limit.
-        let tokenizer = TextAnalyzer::builder(
-            NgramTokenizer::new(1, 15, true)?
-        )
-        .filter(RemoveLongFilter::limit(40))
-        .filter(LowerCaser)
-        .filter(AsciiFoldingFilter)
-        .build();
-        tokenizer_manager.register("slovak_prefix_edge", tokenizer);
-    }
-
-    // TOKENIZER for `prefix_full`: Simple word tokenizer
-    if tokenizer_manager.get("slovak_prefix_full").is_none() {
-        let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
-            .filter(RemoveLongFilter::limit(40))
-            .filter(LowerCaser)
-            .filter(AsciiFoldingFilter)
-            .build();
-        tokenizer_manager.register("slovak_prefix_full", tokenizer);
-    }
-
-    // NGRAM TOKENIZER: For substring matching.
-    if tokenizer_manager.get("slovak_ngram").is_none() {
-        let tokenizer = TextAnalyzer::builder(
-            NgramTokenizer::new(3, 3, false)?
-        )
-        .filter(RemoveLongFilter::limit(40))
-        .filter(LowerCaser)
-        .filter(AsciiFoldingFilter)
-        .build();
-        tokenizer_manager.register("slovak_ngram", tokenizer);
-    }
-
-    Ok(())
-}
+// Re-export the functions from the common crate.
+// This makes them available as `crate::search_schema::create_search_schema`, etc.
+pub use common::search::{create_search_schema, register_slovak_tokenizers};
 
+/// Gets an existing index or creates a new one.
+/// This function now uses the shared logic from the `common` crate.
 pub fn get_or_create_index(table_name: &str) -> tantivy::Result<Index> {
     let index_path = Path::new("./tantivy_indexes").join(table_name);
     std::fs::create_dir_all(&index_path)?;
 
     let index = if index_path.join("meta.json").exists() {
         Index::open_in_dir(&index_path)?
     } else {
         let schema = create_search_schema();
         Index::create_in_dir(&index_path, schema)?
     };
-    register_slovak_tokenizer(&index)?;
+    // This now calls the single, authoritative function from `common`.
+    register_slovak_tokenizers(&index)?;
    Ok(index)
 }
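With the re-exports in place, server-side callers need only one entry point. A hedged sketch of such a caller — the `adresar` table name is borrowed from the proto modules above, and the helper itself is invented for illustration:

use tantivy::doc;
// Hypothetical caller inside the server crate; `search_schema` is the module above.
use crate::search_schema::get_or_create_index;

fn index_address_row(pg_id: u64, name: &str) -> tantivy::Result<()> {
    // Opens ./tantivy_indexes/adresar (creating it on first use) and
    // guarantees the Slovak tokenizers are registered before any writes.
    let index = get_or_create_index("adresar")?;
    let schema = index.schema();

    let mut writer = index.writer(50_000_000)?;
    writer.add_document(doc!(
        schema.get_field("pg_id")? => pg_id,
        schema.get_field("prefix_edge")? => name,
        schema.get_field("prefix_full")? => name,
        schema.get_field("text_ngram")? => name,
    ))?;
    writer.commit()?;
    Ok(())
}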