From 4760f4258942a43aaa6fbd310f924e263227b7bf Mon Sep 17 00:00:00 2001
From: filipriec
Date: Mon, 9 Jun 2025 16:36:18 +0200
Subject: [PATCH] Slovak language tokenized search

---
 Cargo.lock                       |   1 +
 search/src/lib.rs                | 159 ++++++++++++++++++++++---------
 server/Cargo.toml                |   1 +
 server/src/bin/manual_indexer.rs |  83 ----------------
 server/src/indexer.rs            |  59 ++++++------
 server/src/lib.rs                |   1 +
 server/src/search_schema.rs      |  63 ++++++++++++
 7 files changed, 210 insertions(+), 157 deletions(-)
 delete mode 100644 server/src/bin/manual_indexer.rs
 create mode 100644 server/src/search_schema.rs

diff --git a/Cargo.lock b/Cargo.lock
index ac2de13..a19c193 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2843,6 +2843,7 @@ dependencies = [
  "prost",
  "regex",
  "rstest",
+ "rust-stemmers",
  "search",
  "serde",
  "serde_json",
diff --git a/search/src/lib.rs b/search/src/lib.rs
index 8e95c5c..863e262 100644
--- a/search/src/lib.rs
+++ b/search/src/lib.rs
@@ -1,7 +1,11 @@
 // src/lib.rs
 
 use std::path::Path;
-use tantivy::{collector::TopDocs, query::QueryParser, Index, TantivyDocument};
+use tantivy::collector::TopDocs;
+use tantivy::query::{BooleanQuery, Occur, Query, TermQuery};
+use tantivy::schema::IndexRecordOption;
+use tantivy::tokenizer::Tokenizer;
+use tantivy::{Index, TantivyDocument, Term};
 use tonic::{Request, Response, Status};
 
 use common::proto::multieko2::search::{
@@ -13,6 +17,43 @@ use tantivy::schema::Value;
 
 pub struct SearcherService;
 
+// Normalize diacritics in queries
+fn normalize_slovak_text(text: &str) -> String {
+    text.chars()
+        .map(|c| match c {
+            'á' | 'à' | 'â' | 'ä' | 'ă' | 'ā' => 'a',
+            'Á' | 'À' | 'Â' | 'Ä' | 'Ă' | 'Ā' => 'A',
+            'é' | 'è' | 'ê' | 'ë' | 'ě' | 'ē' => 'e',
+            'É' | 'È' | 'Ê' | 'Ë' | 'Ě' | 'Ē' => 'E',
+            'í' | 'ì' | 'î' | 'ï' | 'ī' => 'i',
+            'Í' | 'Ì' | 'Î' | 'Ï' | 'Ī' => 'I',
+            'ó' | 'ò' | 'ô' | 'ö' | 'ō' | 'ő' => 'o',
+            'Ó' | 'Ò' | 'Ô' | 'Ö' | 'Ō' | 'Ő' => 'O',
+            'ú' | 'ù' | 'û' | 'ü' | 'ū' | 'ű' => 'u',
+            'Ú' | 'Ù' | 'Û' | 'Ü' | 'Ū' | 'Ű' => 'U',
+            'ý' | 'ỳ' | 'ŷ' | 'ÿ' => 'y',
+            'Ý' | 'Ỳ' | 'Ŷ' | 'Ÿ' => 'Y',
+            'č' => 'c',
+            'Č' => 'C',
+            'ď' => 'd',
+            'Ď' => 'D',
+            'ľ' => 'l',
+            'Ľ' => 'L',
+            'ň' => 'n',
+            'Ň' => 'N',
+            'ř' => 'r',
+            'Ř' => 'R',
+            'š' => 's',
+            'Š' => 'S',
+            'ť' => 't',
+            'Ť' => 'T',
+            'ž' => 'z',
+            'Ž' => 'Z',
+            _ => c,
+        })
+        .collect()
+}
+
 #[tonic::async_trait]
 impl Searcher for SearcherService {
     async fn search_table(
@@ -27,9 +68,7 @@ impl Searcher for SearcherService {
             return Err(Status::invalid_argument("Query cannot be empty"));
         }
 
-        // Open the index for this table
         let index_path = Path::new("./tantivy_indexes").join(&table_name);
-
         if !index_path.exists() {
             return Err(Status::not_found(format!(
                 "No search index found for table '{}'",
@@ -37,68 +76,73 @@ impl Searcher for SearcherService {
             )));
         }
 
-        // Open the index
-        let index = Index::open_in_dir(&index_path).map_err(|e| {
-            Status::internal(format!("Failed to open index: {}", e))
+        let index = Index::open_in_dir(&index_path)
+            .map_err(|e| Status::internal(format!("Failed to open index: {}", e)))?;
+
+        register_slovak_tokenizer(&index).map_err(|e| {
+            Status::internal(format!("Failed to register Slovak tokenizer: {}", e))
         })?;
 
-        // Create reader and searcher
         let reader = index.reader().map_err(|e| {
             Status::internal(format!("Failed to create index reader: {}", e))
         })?;
-
         let searcher = reader.searcher();
         let schema = index.schema();
 
-        // Get the fields we need
-        let all_text_field = match schema.get_field("all_text") {
-            Ok(field) => field,
-            Err(_) => {
-                return Err(Status::internal(
-                    "Schema is missing the 'all_text' field.",
-                ))
-            }
-        };
-
-        let pg_id_field = match schema.get_field("pg_id") {
-            Ok(field) => field,
-            Err(_) => {
-                return Err(Status::internal(
-                    "Schema is missing the 'pg_id' field.",
-                ))
-            }
-        };
-
-        // Parse the query
-        let query_parser =
-            QueryParser::for_index(&index, vec![all_text_field]);
-        let query = query_parser.parse_query(&query_str).map_err(|e| {
-            Status::invalid_argument(format!("Invalid query: {}", e))
+        let text_sk_field = schema.get_field("text_sk").map_err(|_| {
+            Status::internal("Schema is missing the 'text_sk' field.")
+        })?;
+        let pg_id_field = schema.get_field("pg_id").map_err(|_| {
+            Status::internal("Schema is missing the 'pg_id' field.")
         })?;
 
-        // Perform the search
+        // --- Query construction ---
+
+        // 1. Get the exact tokenizer used for indexing the target field.
+        let mut tokenizer = index
+            .tokenizer_for_field(text_sk_field)
+            .map_err(|e| Status::internal(format!("Tokenizer not found: {}", e)))?;
+
+        // 2. Manually tokenize the user's normalized query string.
+        // Bind the normalized string to a local so it outlives the token stream.
+        let normalized_query = normalize_slovak_text(&query_str);
+        let mut token_stream = tokenizer.token_stream(&normalized_query);
+
+        let mut terms = Vec::new();
+        while let Some(token) = token_stream.next() {
+            terms.push(Term::from_field_text(text_sk_field, &token.text));
+        }
+
+        if terms.is_empty() {
+            return Ok(Response::new(SearchResponse { hits: vec![] }));
+        }
+
+        // 3. Create a TermQuery for each token.
+        let term_queries: Vec<(Occur, Box<dyn Query>)> = terms
+            .into_iter()
+            .map(|term| {
+                let term_query = TermQuery::new(term, IndexRecordOption::WithFreqs);
+                (Occur::Must, Box::new(term_query) as Box<dyn Query>)
+            })
+            .collect();
+
+        // 4. Combine them into a BooleanQuery.
+        let final_query = BooleanQuery::new(term_queries);
+        // --- End of query construction ---
+
         let top_docs = searcher
-            .search(&query, &TopDocs::with_limit(100))
+            .search(&final_query, &TopDocs::with_limit(100))
            .map_err(|e| Status::internal(format!("Search failed: {}", e)))?;
 
-        // Convert results to our response format
         let mut hits = Vec::new();
         for (score, doc_address) in top_docs {
-            let doc: TantivyDocument = searcher.doc(doc_address).map_err(
-                |e| {
-                    Status::internal(format!(
-                        "Failed to retrieve document: {}",
-                        e
-                    ))
-                },
-            )?;
+            let doc: TantivyDocument = searcher.doc(doc_address).map_err(|e| {
+                Status::internal(format!("Failed to retrieve document: {}", e))
+            })?;
 
             if let Some(pg_id_value) = doc.get_first(pg_id_field) {
                 if let Some(pg_id) = pg_id_value.as_u64() {
-                    hits.push(Hit {
-                        id: pg_id as i64,
-                        score,
-                    });
+                    hits.push(Hit { id: pg_id as i64, score });
                 }
             }
         }
@@ -107,3 +151,24 @@ impl Searcher for SearcherService {
         Ok(Response::new(response))
     }
 }
+
+/// Registers the Slovak ngram tokenizer
+fn register_slovak_tokenizer(index: &Index) -> tantivy::Result<()> {
+    use tantivy::tokenizer::*;
+
+    let tokenizer_manager = index.tokenizers();
+
+    if tokenizer_manager.get("slovak").is_none() {
+        let slovak_ngram_tokenizer = TextAnalyzer::builder(
+            NgramTokenizer::new(3, 3, false)? // min=3, max=3, prefix_only=false
+        )
+        .filter(RemoveLongFilter::limit(40))
+        .filter(LowerCaser)
+        .filter(AsciiFoldingFilter)
+        .build();
+
+        tokenizer_manager.register("slovak", slovak_ngram_tokenizer); // same name the schema references
+    }
+
+    Ok(())
+}
diff --git a/server/Cargo.toml b/server/Cargo.toml
index 0be5f73..56c8f03 100644
--- a/server/Cargo.toml
+++ b/server/Cargo.toml
@@ -31,6 +31,7 @@ bcrypt = "0.17.0"
 validator = { version = "0.20.0", features = ["derive"] }
 uuid = { version = "1.16.0", features = ["serde", "v4"] }
 jsonwebtoken = "9.3.1"
+rust-stemmers = "1.2.0"
 
 [lib]
 name = "server"
diff --git a/server/src/bin/manual_indexer.rs b/server/src/bin/manual_indexer.rs
deleted file mode 100644
index 25ea7ed..0000000
--- a/server/src/bin/manual_indexer.rs
+++ /dev/null
@@ -1,83 +0,0 @@
-// In server/src/bin/manual_indexer.rs
-
-use sqlx::{PgPool, Row};
-use tantivy::schema::*;
-use tantivy::{doc, Index};
-use std::path::Path;
-
-// --- CONFIGURATION ---
-// IMPORTANT: Change this to a table name that actually exists and has data in your test DB.
-// From your grpcurl output, "2025_test_post" is a good candidate.
-const TABLE_TO_INDEX: &str = "2025_test_post2";
-const INDEX_DIR: &str = "./tantivy_indexes";
-
-#[tokio::main]
-async fn main() -> anyhow::Result<()> {
-    // --- Database Connection ---
-    // This assumes you have a .env file with DATABASE_URL
-    dotenvy::dotenv().ok();
-    let database_url = std::env::var("DATABASE_URL")
-        .expect("DATABASE_URL must be set in your .env file");
-    let pool = PgPool::connect(&database_url).await?;
-    println!("Connected to database.");
-
-    // --- Tantivy Schema Definition ---
-    let mut schema_builder = Schema::builder();
-    // This field will store the original Postgres row ID. It's crucial.
-    schema_builder.add_u64_field("pg_id", INDEXED | STORED);
-    // This field will contain ALL text data from the row, concatenated.
-    schema_builder.add_text_field("all_text", TEXT | STORED);
-    let schema = schema_builder.build();
-
-    // --- Index Creation ---
-    let index_path = Path::new(INDEX_DIR).join(TABLE_TO_INDEX);
-    if index_path.exists() {
-        println!("Removing existing index at: {}", index_path.display());
-        std::fs::remove_dir_all(&index_path)?;
-    }
-    std::fs::create_dir_all(&index_path)?;
-    let index = Index::create_in_dir(&index_path, schema.clone())?;
-    let mut index_writer = index.writer(100_000_000)?; // 100MB heap
-
-    println!("Indexing table: {}", TABLE_TO_INDEX);
-
-    // --- Data Fetching and Indexing ---
-    let qualified_table = format!("gen.\"{}\"", TABLE_TO_INDEX);
-    let query_str = format!("SELECT id, to_jsonb(t) AS data FROM {} t", qualified_table);
-    let rows = sqlx::query(&query_str).fetch_all(&pool).await?;
-
-    if rows.is_empty() {
-        println!("Warning: No rows found in table '{}'. Index will be empty.", TABLE_TO_INDEX);
-    }
-
-    let pg_id_field = schema.get_field("pg_id").unwrap();
-    let all_text_field = schema.get_field("all_text").unwrap();
-
-    for row in &rows {
-        let id: i64 = row.try_get("id")?;
-        let data: serde_json::Value = row.try_get("data")?;
-
-        // Concatenate all text values from the JSON into one big string.
-        let mut full_text = String::new();
-        if let Some(obj) = data.as_object() {
-            for value in obj.values() {
-                if let Some(s) = value.as_str() {
-                    full_text.push_str(s);
-                    full_text.push(' ');
-                }
-            }
-        }
-
-        // Add the document to Tantivy
-        index_writer.add_document(doc!(
-            pg_id_field => id as u64,
-            all_text_field => full_text
-        ))?;
-    }
-
-    // --- Finalize ---
-    index_writer.commit()?;
-    println!("Successfully indexed {} documents into '{}'", rows.len(), index_path.display());
-
-    Ok(())
-}
diff --git a/server/src/indexer.rs b/server/src/indexer.rs
index 87c40de..ec8f370 100644
--- a/server/src/indexer.rs
+++ b/server/src/indexer.rs
@@ -2,10 +2,11 @@
 
 use std::path::Path;
 use sqlx::{PgPool, Row};
-use tantivy::schema::{Schema, Term, TEXT, STORED, INDEXED};
+use tantivy::schema::{Schema, Term};
 use tantivy::{doc, Index, IndexWriter};
 use tokio::sync::mpsc::Receiver;
 use tracing::{error, info, warn};
+use crate::search_schema;
 
 const INDEX_DIR: &str = "./tantivy_indexes";
 
@@ -25,7 +26,6 @@ pub struct IndexCommandData {
 }
 
 /// The main loop for the background indexer task.
-/// It listens for commands on the receiver and updates the Tantivy index.
 pub async fn indexer_task(pool: PgPool, mut receiver: Receiver<IndexCommand>) {
     info!("Background indexer task started.");
     while let Some(command) = receiver.recv().await {
@@ -62,21 +62,13 @@ async fn handle_add_or_update(
         .await?;
     let json_data: serde_json::Value = row.try_get("data")?;
 
-    // 2. Prepare the Tantivy document
-    let mut full_text = String::new();
-    if let Some(obj) = json_data.as_object() {
-        for value in obj.values() {
-            if let Some(s) = value.as_str() {
-                full_text.push_str(s);
-                full_text.push(' ');
-            }
-        }
-    }
+    // 2. Extract all text content for Slovak processing
+    let slovak_text = extract_text_content(&json_data);
 
     // 3. Open the index and write the document
     let (mut writer, schema) = get_index_writer(&data.table_name)?;
     let pg_id_field = schema.get_field("pg_id").unwrap();
-    let all_text_field = schema.get_field("all_text").unwrap();
+    let text_sk_field = schema.get_field("text_sk").unwrap();
 
     // First, delete any existing document with this ID to handle updates
     let id_term = Term::from_field_u64(pg_id_field, data.row_id as u64);
@@ -85,13 +77,13 @@ async fn handle_add_or_update(
     // Add the new document
     writer.add_document(doc!(
         pg_id_field => data.row_id as u64,
-        all_text_field => full_text
+        text_sk_field => slovak_text
     ))?;
 
     // 4. Commit changes
     writer.commit()?;
     info!(
-        "Successfully indexed document id:{} for table:{}",
+        "Successfully indexed Slovak document id:{} for table:{}",
         data.row_id, data.table_name
     );
@@ -122,19 +114,32 @@ async fn handle_delete(
 fn get_index_writer(
     table_name: &str,
 ) -> anyhow::Result<(IndexWriter, Schema)> {
-    let index_path = Path::new(INDEX_DIR).join(table_name);
-    std::fs::create_dir_all(&index_path)?;
-
-    let index = Index::open_in_dir(&index_path).or_else(|_| {
-        // If it doesn't exist, create it with the standard schema
-        let mut schema_builder = Schema::builder();
-        schema_builder.add_u64_field("pg_id", INDEXED | STORED);
-        schema_builder.add_text_field("all_text", TEXT | STORED);
-        let schema = schema_builder.build();
-        Index::create_in_dir(&index_path, schema)
-    })?;
-
+    let index = search_schema::get_or_create_index(table_name)?;
     let schema = index.schema();
     let writer = index.writer(100_000_000)?; // 100MB heap
     Ok((writer, schema))
 }
+
+/// Extract all text content from a JSON object for indexing
+fn extract_text_content(json_data: &serde_json::Value) -> String {
+    let mut full_text = String::new();
+
+    if let Some(obj) = json_data.as_object() {
+        for value in obj.values() {
+            match value {
+                serde_json::Value::String(s) => {
+                    full_text.push_str(s);
+                    full_text.push(' ');
+                }
+                serde_json::Value::Number(n) => {
+                    full_text.push_str(&n.to_string());
+                    full_text.push(' ');
+                }
+                // We could recursively handle nested objects if needed
+                _ => {}
+            }
+        }
+    }
+
+    full_text.trim().to_string()
+}
diff --git a/server/src/lib.rs b/server/src/lib.rs
index 40ab7d8..7d18246 100644
--- a/server/src/lib.rs
+++ b/server/src/lib.rs
@@ -2,6 +2,7 @@
 pub mod db;
 pub mod auth;
 pub mod indexer;
+pub mod search_schema;
 pub mod server;
 pub mod adresar;
 pub mod uctovnictvo;
diff --git a/server/src/search_schema.rs b/server/src/search_schema.rs
new file mode 100644
index 0000000..ee4eb97
--- /dev/null
+++ b/server/src/search_schema.rs
@@ -0,0 +1,63 @@
+// server/src/search_schema.rs
+
+use tantivy::schema::*;
+use tantivy::tokenizer::*;
+use tantivy::Index;
+use std::path::Path;
+
+/// Creates a Tantivy schema optimized for Slovak ngram search
+pub fn create_search_schema() -> Schema {
+    let mut schema_builder = Schema::builder();
+
+    // ID field to link back to PostgreSQL
+    schema_builder.add_u64_field("pg_id", INDEXED | STORED);
+
+    // Slovak text field with ngram tokenizer for search-as-you-type
+    let text_field_indexing = TextFieldIndexing::default()
+        .set_tokenizer("slovak") // must match the tokenizer registered below
+        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
+
+    let text_options = TextOptions::default()
+        .set_indexing_options(text_field_indexing)
+        .set_stored();
+
+    schema_builder.add_text_field("text_sk", text_options);
+
+    schema_builder.build()
+}
+
+/// Registers the Slovak ngram tokenizer with the index
+pub fn register_slovak_tokenizer(index: &Index) -> tantivy::Result<()> {
+    let tokenizer_manager = index.tokenizers();
+
+    // Build the Slovak ngram pipeline under the same "slovak" name the schema references
+    let slovak_ngram_tokenizer = TextAnalyzer::builder(
+        NgramTokenizer::new(3, 3, false)? // min=3, max=3, prefix_only=false
+    )
+    .filter(RemoveLongFilter::limit(40)) // Remove very long tokens
+    .filter(LowerCaser) // Convert to lowercase
+    .filter(AsciiFoldingFilter) // Handle diacritics: č->c, š->s, ž->z, etc.
+    .build();
+
+    tokenizer_manager.register("slovak", slovak_ngram_tokenizer);
+
+    Ok(())
+}
+
+/// Gets or creates an index for a table with proper Slovak ngram processing
+pub fn get_or_create_index(table_name: &str) -> tantivy::Result<Index> {
+    let index_path = Path::new("./tantivy_indexes").join(table_name);
+    std::fs::create_dir_all(&index_path)?;
+
+    let index = if index_path.join("meta.json").exists() {
+        Index::open_in_dir(&index_path)?
+    } else {
+        let schema = create_search_schema();
+        Index::create_in_dir(&index_path, schema)?
+    };
+
+    // Always register the tokenizer when opening
+    register_slovak_tokenizer(&index)?;
+
+    Ok(index)
+}
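
A quick sanity check for normalize_slovak_text(): the sketch below is not part
of the patch, and the sample strings are invented. Since the function is
private to search/src/lib.rs, the test module would sit in that same file:

    #[cfg(test)]
    mod tests {
        use super::normalize_slovak_text;

        #[test]
        fn folds_slovak_diacritics() {
            // Every mapped character collapses to its ASCII base letter.
            assert_eq!(
                normalize_slovak_text("Žltá ľadová čučoriedka"),
                "Zlta ladova cucoriedka"
            );
            // Unmapped characters pass through unchanged.
            assert_eq!(normalize_slovak_text("abc 123"), "abc 123");
        }
    }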
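
For reviewers, an end-to-end sketch of the path this patch wires up: the same
schema shape, the same "slovak" trigram pipeline, and the same Must-of-TermQuery
construction, run against a throwaway RAM index. The sample row, the query word,
and the 50 MB writer budget are arbitrary choices for the demo; the API calls
mirror the patch:

    use tantivy::collector::TopDocs;
    use tantivy::query::{BooleanQuery, Occur, Query, TermQuery};
    use tantivy::schema::{
        IndexRecordOption, Schema, TextFieldIndexing, TextOptions, INDEXED, STORED,
    };
    use tantivy::tokenizer::{
        AsciiFoldingFilter, LowerCaser, NgramTokenizer, RemoveLongFilter,
        TextAnalyzer, TokenStream,
    };
    use tantivy::{doc, Index, Term};

    fn main() -> tantivy::Result<()> {
        // Schema: pg_id plus "text_sk" indexed with the "slovak" tokenizer.
        let mut builder = Schema::builder();
        let pg_id = builder.add_u64_field("pg_id", INDEXED | STORED);
        let indexing = TextFieldIndexing::default()
            .set_tokenizer("slovak")
            .set_index_option(IndexRecordOption::WithFreqsAndPositions);
        let text_sk = builder.add_text_field(
            "text_sk",
            TextOptions::default().set_indexing_options(indexing).set_stored(),
        );
        let index = Index::create_in_ram(builder.build());

        // Same trigram + folding pipeline as register_slovak_tokenizer().
        let analyzer = TextAnalyzer::builder(NgramTokenizer::new(3, 3, false)?)
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser)
            .filter(AsciiFoldingFilter)
            .build();
        index.tokenizers().register("slovak", analyzer);

        let mut writer = index.writer(50_000_000)?;
        writer.add_document(doc!(
            pg_id => 1u64,
            text_sk => "Zelený čaj z Japonska"
        ))?;
        writer.commit()?;

        // Query side: the user typed "caj" without diacritics. Tokenize it
        // with the field's own analyzer and AND the trigram terms together.
        let mut tokenizer = index.tokenizer_for_field(text_sk)?;
        let mut stream = tokenizer.token_stream("caj");
        let mut clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new();
        while let Some(token) = stream.next() {
            let term = Term::from_field_text(text_sk, &token.text);
            let tq = TermQuery::new(term, IndexRecordOption::WithFreqs);
            clauses.push((Occur::Must, Box::new(tq) as Box<dyn Query>));
        }
        let query = BooleanQuery::new(clauses);

        let searcher = index.reader()?.searcher();
        let hits = searcher.search(&query, &TopDocs::with_limit(10))?;
        // "čaj" was indexed as the folded trigram "caj", so this matches.
        assert_eq!(hits.len(), 1);
        Ok(())
    }

Because every trigram of the query is an Occur::Must clause, the search behaves
like a diacritic-insensitive substring match: longer queries only become more
selective, and queries shorter than three characters produce no terms, which
the handler answers with an empty hit list.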