From 350c522d1968c3f50eb059c8873e669fee01df70 Mon Sep 17 00:00:00 2001
From: filipriec
Date: Tue, 10 Jun 2025 00:22:31 +0200
Subject: [PATCH] Better search, but it still has some flaws. It at least
 works, even though it's not perfect. It needs more testing, but I'm happy
 enough with it for now, so I'm keeping it this way.

---
 search/src/lib.rs           | 179 ++++++++++++++++++++++++++----------
 server/src/indexer.rs       |  32 +++----
 server/src/search_schema.rs |  82 ++++++++++++-----
 3 files changed, 202 insertions(+), 91 deletions(-)

diff --git a/search/src/lib.rs b/search/src/lib.rs
index 863e262..9df77ba 100644
--- a/search/src/lib.rs
+++ b/search/src/lib.rs
@@ -1,18 +1,20 @@
-// src/lib.rs
+// search/src/lib.rs
 use std::path::Path;
 use tantivy::collector::TopDocs;
-use tantivy::query::{BooleanQuery, Occur, Query, TermQuery};
+use tantivy::query::{
+    BooleanQuery, BoostQuery, FuzzyTermQuery, Occur, Query, QueryParser,
+    TermQuery,
+};
 use tantivy::schema::IndexRecordOption;
-use tantivy::tokenizer::Tokenizer;
 use tantivy::{Index, TantivyDocument, Term};
 use tonic::{Request, Response, Status};
 use common::proto::multieko2::search::{
     search_response::Hit, SearchRequest, SearchResponse,
 };
-use common::proto::multieko2::search::searcher_server::Searcher;
 pub use common::proto::multieko2::search::searcher_server::SearcherServer;
+use common::proto::multieko2::search::searcher_server::Searcher;
 use tantivy::schema::Value;
 
 pub struct SearcherService;
@@ -79,8 +81,8 @@ impl Searcher for SearcherService {
         let index = Index::open_in_dir(&index_path)
             .map_err(|e| Status::internal(format!("Failed to open index: {}", e)))?;
 
-        register_slovak_tokenizer(&index).map_err(|e| {
-            Status::internal(format!("Failed to register Slovak tokenizer: {}", e))
+        register_slovak_tokenizers(&index).map_err(|e| {
+            Status::internal(format!("Failed to register Slovak tokenizers: {}", e))
         })?;
 
         let reader = index.reader().map_err(|e| {
@@ -89,49 +91,109 @@ impl Searcher for SearcherService {
         let searcher = reader.searcher();
         let schema = index.schema();
 
-        let text_sk_field = schema.get_field("text_sk").map_err(|_| {
-            Status::internal("Schema is missing the 'text_sk' field.")
+        let prefix_edge_field = schema.get_field("prefix_edge").map_err(|_| {
+            Status::internal("Schema is missing the 'prefix_edge' field.")
+        })?;
+        let prefix_full_field = schema.get_field("prefix_full").map_err(|_| {
+            Status::internal("Schema is missing the 'prefix_full' field.")
+        })?;
+        let text_ngram_field = schema.get_field("text_ngram").map_err(|_| {
+            Status::internal("Schema is missing the 'text_ngram' field.")
         })?;
         let pg_id_field = schema.get_field("pg_id").map_err(|_| {
             Status::internal("Schema is missing the 'pg_id' field.")
         })?;
 
-        // --- FINAL, ROBUST QUERY LOGIC ---
-
-        // 1. Get the exact tokenizer used for indexing the target field.
-        let mut tokenizer = index
-            .tokenizer_for_field(text_sk_field)
-            .map_err(|e| Status::internal(format!("Tokenizer not found: {}", e)))?;
-
-        // 2. Manually tokenize the user's normalized query string.
-        // CORRECTED: Store the normalized string in a variable to extend its lifetime.
         let normalized_query = normalize_slovak_text(&query_str);
-        let mut token_stream = tokenizer.token_stream(&normalized_query);
+        let words: Vec<&str> = normalized_query.split_whitespace().collect();
 
-        let mut terms = Vec::new();
-        while let Some(token) = token_stream.next() {
-            terms.push(Term::from_field_text(text_sk_field, &token.text));
-        }
-
-        if terms.is_empty() {
+        if words.is_empty() {
             return Ok(Response::new(SearchResponse { hits: vec![] }));
         }
 
-        // 3. Create a TermQuery for each token.
-        let term_queries: Vec<(Occur, Box<dyn Query>)> = terms
-            .into_iter()
-            .map(|term| {
-                let term_query = TermQuery::new(term, IndexRecordOption::WithFreqs);
-                (Occur::Must, Box::new(term_query) as Box<dyn Query>)
-            })
-            .collect();
+        let mut query_layers: Vec<(Occur, Box<dyn Query>)> = Vec::new();
 
-        // 4. Combine them into a BooleanQuery.
-        let final_query = BooleanQuery::new(term_queries);
-        // --- END OF LOGIC ---
+        // ===============================
+        // LAYER 1: PREFIX MATCHING (HIGHEST PRIORITY, Boost: 4.0)
+        // ===============================
+        {
+            let mut must_clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new();
+            for word in &words {
+                let edge_term =
+                    Term::from_field_text(prefix_edge_field, word);
+                let full_term =
+                    Term::from_field_text(prefix_full_field, word);
+
+                let per_word_query = BooleanQuery::new(vec![
+                    (
+                        Occur::Should,
+                        Box::new(TermQuery::new(
+                            edge_term,
+                            IndexRecordOption::Basic,
+                        )),
+                    ),
+                    (
+                        Occur::Should,
+                        Box::new(TermQuery::new(
+                            full_term,
+                            IndexRecordOption::Basic,
+                        )),
+                    ),
+                ]);
+                must_clauses.push((Occur::Must, Box::new(per_word_query) as Box<dyn Query>));
+            }
+
+            if !must_clauses.is_empty() {
+                let prefix_query = BooleanQuery::new(must_clauses);
+                let boosted_query =
+                    BoostQuery::new(Box::new(prefix_query), 4.0);
+                query_layers.push((Occur::Should, Box::new(boosted_query)));
+            }
+        }
+
+        // ===============================
+        // LAYER 2: FUZZY MATCHING (HIGH PRIORITY, Boost: 3.0)
+        // ===============================
+        {
+            let last_word = words.last().unwrap();
+            let fuzzy_term =
+                Term::from_field_text(prefix_full_field, last_word);
+            let fuzzy_query = FuzzyTermQuery::new(fuzzy_term, 2, true);
+            let boosted_query = BoostQuery::new(Box::new(fuzzy_query), 3.0);
+            query_layers.push((Occur::Should, Box::new(boosted_query)));
+        }
+
+        // ===============================
+        // LAYER 3: PHRASE MATCHING WITH SLOP (MEDIUM PRIORITY, Boost: 2.0)
+        // ===============================
+        if words.len() > 1 {
+            let slop_parser =
+                QueryParser::for_index(&index, vec![prefix_full_field]);
+            let slop_query_str = format!("\"{}\"~3", normalized_query);
+            if let Ok(slop_query) = slop_parser.parse_query(&slop_query_str) {
+                let boosted_query = BoostQuery::new(slop_query, 2.0);
+                query_layers.push((Occur::Should, Box::new(boosted_query)));
+            }
+        }
+
+        // ===============================
+        // LAYER 4: NGRAM SUBSTRING MATCHING (LOWEST PRIORITY, Boost: 1.0)
+        // ===============================
+        {
+            let ngram_parser =
+                QueryParser::for_index(&index, vec![text_ngram_field]);
+            if let Ok(ngram_query) =
+                ngram_parser.parse_query(&normalized_query)
+            {
+                let boosted_query = BoostQuery::new(ngram_query, 1.0);
+                query_layers.push((Occur::Should, Box::new(boosted_query)));
+            }
+        }
+
+        let master_query = BooleanQuery::new(query_layers);
 
         let top_docs = searcher
-            .search(&final_query, &TopDocs::with_limit(100))
+            .search(&master_query, &TopDocs::with_limit(100))
             .map_err(|e| Status::internal(format!("Search failed: {}", e)))?;
 
         let mut hits = Vec::new();
@@ -142,7 +204,10 @@ impl Searcher for SearcherService {
 
             if let Some(pg_id_value) = doc.get_first(pg_id_field) {
                 if let Some(pg_id) = pg_id_value.as_u64() {
-                    hits.push(Hit { id: pg_id as i64, score });
+                    hits.push(Hit {
+                        id: pg_id as i64,
+                        score,
+                    });
                 }
             }
         }
@@ -152,22 +217,40 @@ impl Searcher for SearcherService {
     }
 }
 
-/// Registers the Slovak ngram tokenizer
-fn register_slovak_tokenizer(index: &Index) -> tantivy::Result<()> {
+/// This function is an exact mirror of the one in `server/src/search_schema.rs`.
+fn
register_slovak_tokenizers(index: &Index) -> tantivy::Result<()> { use tantivy::tokenizer::*; let tokenizer_manager = index.tokenizers(); - if tokenizer_manager.get("slovak").is_none() { // CHANGED BACK TO "slovak" - let slovak_ngram_tokenizer = TextAnalyzer::builder( - NgramTokenizer::new(3, 3, false)? // min=3, max=3, prefix_only=false - ) - .filter(RemoveLongFilter::limit(40)) - .filter(LowerCaser) - .filter(AsciiFoldingFilter) - .build(); + // TOKENIZER for `prefix_edge`: Edge N-gram (1-15 chars) + if tokenizer_manager.get("slovak_prefix_edge").is_none() { + let tokenizer = TextAnalyzer::builder(NgramTokenizer::new(1, 15, true)?) + .filter(RemoveLongFilter::limit(40)) + .filter(LowerCaser) + .filter(AsciiFoldingFilter) + .build(); + tokenizer_manager.register("slovak_prefix_edge", tokenizer); + } - tokenizer_manager.register("slovak", slovak_ngram_tokenizer); // CHANGED BACK TO "slovak" + // TOKENIZER for `prefix_full`: Simple word tokenizer + if tokenizer_manager.get("slovak_prefix_full").is_none() { + let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()) + .filter(RemoveLongFilter::limit(40)) + .filter(LowerCaser) + .filter(AsciiFoldingFilter) + .build(); + tokenizer_manager.register("slovak_prefix_full", tokenizer); + } + + // NGRAM TOKENIZER: For substring matching. + if tokenizer_manager.get("slovak_ngram").is_none() { + let tokenizer = TextAnalyzer::builder(NgramTokenizer::new(3, 3, false)?) + .filter(RemoveLongFilter::limit(40)) + .filter(LowerCaser) + .filter(AsciiFoldingFilter) + .build(); + tokenizer_manager.register("slovak_ngram", tokenizer); } Ok(()) diff --git a/server/src/indexer.rs b/server/src/indexer.rs index ec8f370..2abd002 100644 --- a/server/src/indexer.rs +++ b/server/src/indexer.rs @@ -1,11 +1,11 @@ -// src/indexer.rs +// server/src/indexer.rs -use std::path::Path; use sqlx::{PgPool, Row}; -use tantivy::schema::{Schema, Term}; -use tantivy::{doc, Index, IndexWriter}; +use tantivy::schema::Term; +use tantivy::{doc, IndexWriter}; use tokio::sync::mpsc::Receiver; use tracing::{error, info, warn}; +use tantivy::schema::Schema; use crate::search_schema; const INDEX_DIR: &str = "./tantivy_indexes"; @@ -49,44 +49,39 @@ async fn handle_add_or_update( pool: &PgPool, data: IndexCommandData, ) -> anyhow::Result<()> { - // 1. Fetch the full row data from PostgreSQL let qualified_table = format!("gen.\"{}\"", data.table_name); let query_str = format!( "SELECT to_jsonb(t) AS data FROM {} t WHERE id = $1", qualified_table ); - let row = sqlx::query(&query_str) .bind(data.row_id) .fetch_one(pool) .await?; let json_data: serde_json::Value = row.try_get("data")?; - - // 2. Extract all text content for Slovak processing let slovak_text = extract_text_content(&json_data); - // 3. 
Open the index and write the document let (mut writer, schema) = get_index_writer(&data.table_name)?; let pg_id_field = schema.get_field("pg_id").unwrap(); - let text_sk_field = schema.get_field("text_sk").unwrap(); + let prefix_edge_field = schema.get_field("prefix_edge").unwrap(); + let prefix_full_field = schema.get_field("prefix_full").unwrap(); + let text_ngram_field = schema.get_field("text_ngram").unwrap(); - // First, delete any existing document with this ID to handle updates let id_term = Term::from_field_u64(pg_id_field, data.row_id as u64); writer.delete_term(id_term); - // Add the new document writer.add_document(doc!( pg_id_field => data.row_id as u64, - text_sk_field => slovak_text + prefix_edge_field => slovak_text.clone(), + prefix_full_field => slovak_text.clone(), + text_ngram_field => slovak_text ))?; - // 4. Commit changes writer.commit()?; info!( - "Successfully indexed Slovak document id:{} for table:{}", + "Successfully indexed document id:{} for table:{}", data.row_id, data.table_name ); - Ok(()) } @@ -123,7 +118,7 @@ fn get_index_writer( /// Extract all text content from a JSON object for indexing fn extract_text_content(json_data: &serde_json::Value) -> String { let mut full_text = String::new(); - + if let Some(obj) = json_data.as_object() { for value in obj.values() { match value { @@ -135,11 +130,10 @@ fn extract_text_content(json_data: &serde_json::Value) -> String { full_text.push_str(&n.to_string()); full_text.push(' '); } - // We could recursively handle nested objects if needed _ => {} } } } - + full_text.trim().to_string() } diff --git a/server/src/search_schema.rs b/server/src/search_schema.rs index ee4eb97..d158619 100644 --- a/server/src/search_schema.rs +++ b/server/src/search_schema.rs @@ -5,59 +5,93 @@ use tantivy::tokenizer::*; use tantivy::Index; use std::path::Path; -/// Creates a Tantivy schema optimized for Slovak ngram search +/// Creates a hybrid Slovak search schema with optimized prefix fields. pub fn create_search_schema() -> Schema { let mut schema_builder = Schema::builder(); - // ID field to link back to PostgreSQL schema_builder.add_u64_field("pg_id", INDEXED | STORED); - // Slovak text field with ngram tokenizer for search-as-you-type - let text_field_indexing = TextFieldIndexing::default() - .set_tokenizer("slovak") // KEEP THE SAME NAME + // FIELD 1: For prefixes (1-15 chars). + let short_prefix_indexing = TextFieldIndexing::default() + .set_tokenizer("slovak_prefix_edge") .set_index_option(IndexRecordOption::WithFreqsAndPositions); - - let text_options = TextOptions::default() - .set_indexing_options(text_field_indexing) + let short_prefix_options = TextOptions::default() + .set_indexing_options(short_prefix_indexing) .set_stored(); + schema_builder.add_text_field("prefix_edge", short_prefix_options); - schema_builder.add_text_field("text_sk", text_options); + // FIELD 2: For the full word. + let full_word_indexing = TextFieldIndexing::default() + .set_tokenizer("slovak_prefix_full") + .set_index_option(IndexRecordOption::WithFreqsAndPositions); + let full_word_options = TextOptions::default() + .set_indexing_options(full_word_indexing) + .set_stored(); + schema_builder.add_text_field("prefix_full", full_word_options); + + // NGRAM FIELD: For substring matching. 
+    let ngram_field_indexing = TextFieldIndexing::default()
+        .set_tokenizer("slovak_ngram")
+        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
+    let ngram_options = TextOptions::default()
+        .set_indexing_options(ngram_field_indexing)
+        .set_stored();
+    schema_builder.add_text_field("text_ngram", ngram_options);
 
     schema_builder.build()
 }
 
-/// Registers the Slovak ngram tokenizer with the index
+/// Registers all necessary Slovak tokenizers with the index.
 pub fn register_slovak_tokenizer(index: &Index) -> tantivy::Result<()> {
     let tokenizer_manager = index.tokenizers();
 
-    // Create Slovak ngram tokenizer pipeline - BUT REGISTER AS "slovak"
-    let slovak_ngram_tokenizer = TextAnalyzer::builder(
-        NgramTokenizer::new(3, 3, false)? // min=3, max=3, prefix_only=false
-    )
-    .filter(RemoveLongFilter::limit(40)) // Remove very long tokens
-    .filter(LowerCaser) // Convert to lowercase
-    .filter(AsciiFoldingFilter) // Handle diacritics: č->c, š->s, ž->z, etc.
-    .build();
+    // TOKENIZER for `prefix_edge`: Edge N-gram (1-15 chars)
+    if tokenizer_manager.get("slovak_prefix_edge").is_none() {
+        // Extend the max_gram to a practical limit for prefix matching.
+        let tokenizer = TextAnalyzer::builder(
+            NgramTokenizer::new(1, 15, true)?
+        )
+        .filter(RemoveLongFilter::limit(40))
+        .filter(LowerCaser)
+        .filter(AsciiFoldingFilter)
+        .build();
+        tokenizer_manager.register("slovak_prefix_edge", tokenizer);
+    }
 
-    tokenizer_manager.register("slovak", slovak_ngram_tokenizer); // SAME NAME
+    // TOKENIZER for `prefix_full`: Simple word tokenizer
+    if tokenizer_manager.get("slovak_prefix_full").is_none() {
+        let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
+            .filter(RemoveLongFilter::limit(40))
+            .filter(LowerCaser)
+            .filter(AsciiFoldingFilter)
+            .build();
+        tokenizer_manager.register("slovak_prefix_full", tokenizer);
+    }
+
+    // NGRAM TOKENIZER: For substring matching.
+    if tokenizer_manager.get("slovak_ngram").is_none() {
+        let tokenizer = TextAnalyzer::builder(
+            NgramTokenizer::new(3, 3, false)?
+        )
+        .filter(RemoveLongFilter::limit(40))
+        .filter(LowerCaser)
+        .filter(AsciiFoldingFilter)
+        .build();
+        tokenizer_manager.register("slovak_ngram", tokenizer);
+    }
 
     Ok(())
 }
 
-/// Gets or creates an index for a table with proper Slovak ngram processing
 pub fn get_or_create_index(table_name: &str) -> tantivy::Result<Index> {
     let index_path = Path::new("./tantivy_indexes").join(table_name);
     std::fs::create_dir_all(&index_path)?;
-
     let index = if index_path.join("meta.json").exists() {
         Index::open_in_dir(&index_path)?
     } else {
        let schema = create_search_schema();
         Index::create_in_dir(&index_path, schema)?
     };
-
-    // Always register the tokenizer when opening
     register_slovak_tokenizer(&index)?;
-
     Ok(index)
 }
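
For reference, here is a condensed sketch (not part of the patch) of how the layered query in search/src/lib.rs is assembled for a multi-word input. It uses only the tantivy types the patch already imports; the free-standing helper name build_layered_query is illustrative, and the slop and ngram layers (which go through QueryParser) are omitted for brevity.

use tantivy::query::{BooleanQuery, BoostQuery, FuzzyTermQuery, Occur, Query, TermQuery};
use tantivy::schema::{Field, IndexRecordOption};
use tantivy::Term;

/// Illustrative only: mirrors layers 1 and 2 of the master query.
fn build_layered_query(
    prefix_edge_field: Field,
    prefix_full_field: Field,
    words: &[&str],
) -> BooleanQuery {
    let mut query_layers: Vec<(Occur, Box<dyn Query>)> = Vec::new();

    // Layer 1 (boost 4.0): every word must hit either the edge-ngram field
    // (prefix match) or the full-word field (exact word match).
    let must_clauses: Vec<(Occur, Box<dyn Query>)> = words
        .iter()
        .map(|word| {
            let edge = TermQuery::new(
                Term::from_field_text(prefix_edge_field, word),
                IndexRecordOption::Basic,
            );
            let full = TermQuery::new(
                Term::from_field_text(prefix_full_field, word),
                IndexRecordOption::Basic,
            );
            let either = BooleanQuery::new(vec![
                (Occur::Should, Box::new(edge) as Box<dyn Query>),
                (Occur::Should, Box::new(full) as Box<dyn Query>),
            ]);
            (Occur::Must, Box::new(either) as Box<dyn Query>)
        })
        .collect();
    query_layers.push((
        Occur::Should,
        Box::new(BoostQuery::new(Box::new(BooleanQuery::new(must_clauses)), 4.0)),
    ));

    // Layer 2 (boost 3.0): fuzzy match (edit distance 2) on the word the
    // user is still typing, to tolerate typos.
    if let Some(last_word) = words.last() {
        let fuzzy = FuzzyTermQuery::new(
            Term::from_field_text(prefix_full_field, last_word),
            2,
            true,
        );
        query_layers.push((Occur::Should, Box::new(BoostQuery::new(Box::new(fuzzy), 3.0))));
    }

    BooleanQuery::new(query_layers)
}

Because every layer is an Occur::Should clause of the outer BooleanQuery, any single layer can produce a hit on its own, while the boosts decide how those hits are ranked relative to each other.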