From 036e12f3452057a06062117888e0cafa0a2ec8d6 Mon Sep 17 00:00:00 2001 From: Priec Date: Wed, 29 Apr 2026 01:08:59 +0200 Subject: [PATCH] indexing done by the profile and not table --- common/src/search.rs | 13 +- search/src/lib.rs | 356 +++++++++++++++++++++++-------------------- 2 files changed, 204 insertions(+), 165 deletions(-) diff --git a/common/src/search.rs b/common/src/search.rs index 5195c95..1ba9509 100644 --- a/common/src/search.rs +++ b/common/src/search.rs @@ -5,9 +5,14 @@ use tantivy::schema::*; use tantivy::tokenizer::*; use tantivy::Index; -/// Returns the on-disk path for a profile/table search index. -pub fn search_index_path(root: &Path, profile_name: &str, table_name: &str) -> PathBuf { - root.join(profile_name).join(table_name) +/// Returns the on-disk path for a profile search index. +pub fn search_index_path(root: &Path, profile_name: &str) -> PathBuf { + root.join(profile_name) +} + +/// Returns the unique index key for one table row inside a profile index. +pub fn search_row_key(table_name: &str, row_id: i64) -> String { + format!("{}:{}", table_name, row_id) } /// Creates a hybrid Slovak search schema with optimized prefix fields. @@ -15,6 +20,8 @@ pub fn create_search_schema() -> Schema { let mut schema_builder = Schema::builder(); schema_builder.add_u64_field("pg_id", INDEXED | STORED); + schema_builder.add_text_field("table_name", STRING | STORED); + schema_builder.add_text_field("row_key", STRING | STORED); // For prefixes (1-4 chars). 
let short_prefix_indexing = TextFieldIndexing::default() diff --git a/search/src/lib.rs b/search/src/lib.rs index 228cdc6..47e1e2c 100644 --- a/search/src/lib.rs +++ b/search/src/lib.rs @@ -30,12 +30,18 @@ pub struct SearcherService { pub pool: PgPool, } -struct SearchTarget { - table_name: String, - qualified_table: String, +struct SearchScope { + profile_name: String, + requested_table: Option<String>, index_path: PathBuf, } +struct SearchCandidate { + score: f32, + pg_id: i64, + table_name: String, +} + fn normalize_slovak_text(text: &str) -> String { text.chars() .map(|c| match c { @@ -96,20 +102,21 @@ fn qualify_profile_table(profile_name: &str, table_name: &str) -> String { } async fn profile_exists(pool: &PgPool, profile_name: &str) -> Result<bool, Status> { - let exists = sqlx::query_scalar::<_, bool>("SELECT EXISTS(SELECT 1 FROM schemas WHERE name = $1)") - .bind(profile_name) - .fetch_one(pool) - .await - .map_err(|e| Status::internal(format!("Profile lookup failed: {}", e)))?; + let exists = + sqlx::query_scalar::<_, bool>("SELECT EXISTS(SELECT 1 FROM schemas WHERE name = $1)") + .bind(profile_name) + .fetch_one(pool) + .await + .map_err(|e| Status::internal(format!("Profile lookup failed: {}", e)))?; Ok(exists) } // Scope resolution -async fn resolve_search_targets( +async fn resolve_search_scope( pool: &PgPool, profile_name: &str, requested_table: Option<&str>, -) -> Result<Vec<SearchTarget>, Status> { +) -> Result<SearchScope, Status> { validate_identifier(profile_name, "profile_name")?; if !profile_exists(pool, profile_name).await?
{ @@ -119,7 +126,9 @@ async fn resolve_search_targets( ))); } - let tables = if let Some(table_name) = requested_table.filter(|value| !value.trim().is_empty()) { + let requested_table = if let Some(table_name) = + requested_table.filter(|value| !value.trim().is_empty()) + { validate_identifier(table_name, "table_name")?; let row = sqlx::query_scalar::<_, String>( @@ -136,38 +145,21 @@ async fn resolve_search_targets( .await .map_err(|e| Status::internal(format!("Table lookup failed: {}", e)))?; - let table_name = row.ok_or_else(|| { + Some(row.ok_or_else(|| { Status::not_found(format!( "Table '{}' was not found in profile '{}'", table_name, profile_name )) - })?; - - vec![table_name] + })?) } else { - sqlx::query_scalar::<_, String>( - r#" - SELECT td.table_name - FROM table_definitions td - JOIN schemas s ON td.schema_id = s.id - WHERE s.name = $1 - ORDER BY td.table_name - "#, - ) - .bind(profile_name) - .fetch_all(pool) - .await - .map_err(|e| Status::internal(format!("Profile table lookup failed: {}", e)))? 
+ None }; - Ok(tables - .into_iter() - .map(|table_name| SearchTarget { - qualified_table: qualify_profile_table(profile_name, &table_name), - index_path: search_index_path(Path::new(INDEX_ROOT), profile_name, &table_name), - table_name, - }) - .collect()) + Ok(SearchScope { + profile_name: profile_name.to_string(), + requested_table, + index_path: search_index_path(Path::new(INDEX_ROOT), profile_name), + }) } // Query building @@ -175,8 +167,12 @@ fn build_query( index: &Index, normalized_query: &str, mode: SearchMode, + table_filter: Option<&str>, ) -> Result<Option<BooleanQuery>, Status> { let schema = index.schema(); + let table_name_field = schema + .get_field("table_name") + .map_err(|_| Status::internal("Schema is missing the 'table_name' field."))?; let prefix_edge_field = schema .get_field("prefix_edge") .map_err(|_| Status::internal("Schema is missing the 'prefix_edge' field."))?; @@ -192,7 +188,7 @@ fn build_query( return Ok(None); } - if matches!(mode, SearchMode::Exact) { + let content_query: Box<dyn Query> = if matches!(mode, SearchMode::Exact) { let exact_parser = QueryParser::for_index(index, vec![prefix_full_field]); let exact_query_str = if words.len() == 1 { normalized_query.to_string() } else { @@ -203,80 +199,92 @@ fn build_query( let exact_query = exact_parser .parse_query(&exact_query_str) .map_err(|e| Status::internal(format!("Failed to build exact query: {}", e)))?; + Box::new(exact_query) + } else { + let mut query_layers: Vec<(Occur, Box<dyn Query>)> = Vec::new(); - return Ok(Some(BooleanQuery::new(vec![( + // Layer 1: prefix + { + let mut must_clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new(); + for word in &words { + let edge_term = Term::from_field_text(prefix_edge_field, word); + let full_term = Term::from_field_text(prefix_full_field, word); + + let per_word_query = BooleanQuery::new(vec![ + ( + Occur::Should, + Box::new(TermQuery::new(edge_term, IndexRecordOption::Basic)), + ), + ( + Occur::Should, + Box::new(TermQuery::new(full_term, IndexRecordOption::Basic)), + ), + ]); + 
must_clauses.push((Occur::Must, Box::new(per_word_query))); + } + + if !must_clauses.is_empty() { + let prefix_query = BooleanQuery::new(must_clauses); + let boosted_query = BoostQuery::new(Box::new(prefix_query), 4.0); + query_layers.push((Occur::Should, Box::new(boosted_query))); + } + } + + // Layer 2: fuzzy + { + let last_word = words + .last() + .ok_or_else(|| Status::internal("Query normalization lost all tokens"))?; + let fuzzy_term = Term::from_field_text(prefix_full_field, last_word); + let fuzzy_query = FuzzyTermQuery::new(fuzzy_term, 2, true); + let boosted_query = BoostQuery::new(Box::new(fuzzy_query), 3.0); + query_layers.push((Occur::Should, Box::new(boosted_query))); + } + + // Layer 3: phrase + if words.len() > 1 { + let slop_parser = QueryParser::for_index(index, vec![prefix_full_field]); + let slop_query_str = format!("\"{}\"~3", normalized_query); + if let Ok(slop_query) = slop_parser.parse_query(&slop_query_str) { + let boosted_query = BoostQuery::new(slop_query, 2.0); + query_layers.push((Occur::Should, Box::new(boosted_query))); + } + } + + // Layer 4: ngram + { + let ngram_parser = QueryParser::for_index(index, vec![text_ngram_field]); + if let Ok(ngram_query) = ngram_parser.parse_query(normalized_query) { + let boosted_query = BoostQuery::new(ngram_query, 1.0); + query_layers.push((Occur::Should, Box::new(boosted_query))); + } + } + + Box::new(BooleanQuery::new(query_layers)) + }; + + let mut clauses: Vec<(Occur, Box<dyn Query>)> = vec![(Occur::Must, content_query)]; + if let Some(table_name) = table_filter { + let table_term = Term::from_field_text(table_name_field, table_name); + clauses.push(( Occur::Must, - Box::new(exact_query), - )]))); + Box::new(TermQuery::new(table_term, IndexRecordOption::Basic)), + )); } - let mut query_layers: Vec<(Occur, Box<dyn Query>)> = Vec::new(); - - // Layer 1: prefix - { - let mut must_clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new(); - for word in &words { - let edge_term = Term::from_field_text(prefix_edge_field, word); - let
full_term = Term::from_field_text(prefix_full_field, word); - - let per_word_query = BooleanQuery::new(vec![ - ( - Occur::Should, - Box::new(TermQuery::new(edge_term, IndexRecordOption::Basic)), - ), - ( - Occur::Should, - Box::new(TermQuery::new(full_term, IndexRecordOption::Basic)), - ), - ]); - must_clauses.push((Occur::Must, Box::new(per_word_query))); - } - - if !must_clauses.is_empty() { - let prefix_query = BooleanQuery::new(must_clauses); - let boosted_query = BoostQuery::new(Box::new(prefix_query), 4.0); - query_layers.push((Occur::Should, Box::new(boosted_query))); - } - } - - // Layer 2: fuzzy - { - let last_word = words - .last() - .ok_or_else(|| Status::internal("Query normalization lost all tokens"))?; - let fuzzy_term = Term::from_field_text(prefix_full_field, last_word); - let fuzzy_query = FuzzyTermQuery::new(fuzzy_term, 2, true); - let boosted_query = BoostQuery::new(Box::new(fuzzy_query), 3.0); - query_layers.push((Occur::Should, Box::new(boosted_query))); - } - - // Layer 3: phrase - if words.len() > 1 { - let slop_parser = QueryParser::for_index(index, vec![prefix_full_field]); - let slop_query_str = format!("\"{}\"~3", normalized_query); - if let Ok(slop_query) = slop_parser.parse_query(&slop_query_str) { - let boosted_query = BoostQuery::new(slop_query, 2.0); - query_layers.push((Occur::Should, Box::new(boosted_query))); - } - } - - // Layer 4: ngram - { - let ngram_parser = QueryParser::for_index(index, vec![text_ngram_field]); - if let Ok(ngram_query) = ngram_parser.parse_query(normalized_query) { - let boosted_query = BoostQuery::new(ngram_query, 1.0); - query_layers.push((Occur::Should, Box::new(boosted_query))); - } - } - - Ok(Some(BooleanQuery::new(query_layers))) + Ok(Some(BooleanQuery::new(clauses))) } // Empty query -async fn fetch_default_hits(pool: &PgPool, target: &SearchTarget) -> Result<Vec<Hit>, Status> { +async fn fetch_default_hits( + pool: &PgPool, + profile_name: &str, + table_name: &str, +) -> Result<Vec<Hit>, Status> { let sql = format!(
"SELECT id, to_jsonb(t) AS data FROM {} t WHERE deleted = FALSE ORDER BY id DESC LIMIT {}", - target.qualified_table, DEFAULT_RESULT_LIMIT + qualify_profile_table(profile_name, table_name), + DEFAULT_RESULT_LIMIT ); let rows = sqlx::query(&sql) @@ -293,29 +301,34 @@ async fn fetch_default_hits(pool: &PgPool, target: &SearchTarget) -> Result<Vec -async fn search_target( - pool: &PgPool, - target: &SearchTarget, - query_str: &str, - mode: SearchMode, -) -> Result<Vec<Hit>, Status> { +async fn search_profile( + pool: &PgPool, + scope: &SearchScope, + query_str: &str, + mode: SearchMode, +) -> Result<Vec<Hit>, Status> { - if !target.index_path.exists() { + if !scope.index_path.exists() { return Ok(vec![]); } - let index = Index::open_in_dir(&target.index_path) + let index = Index::open_in_dir(&scope.index_path) .map_err(|e| Status::internal(format!("Failed to open index: {}", e)))?; register_slovak_tokenizers(&index) .map_err(|e| Status::internal(format!("Failed to register Slovak tokenizers: {}", e)))?; - let Some(master_query) = build_query(&index, &normalize_slovak_text(query_str), mode)? else { + let Some(master_query) = build_query( + &index, + &normalize_slovak_text(query_str), + mode, + scope.requested_table.as_deref(), + )? else { return Ok(vec![]); }; @@ -327,6 +340,9 @@ let pg_id_field = schema .get_field("pg_id") .map_err(|_| Status::internal("Schema is missing the 'pg_id' field."))?; + let table_name_field = schema + .get_field("table_name") + .map_err(|_| Status::internal("Schema is missing the 'table_name' field."))?; let top_docs = searcher .search(&master_query, &TopDocs::with_limit(SEARCH_RESULT_LIMIT)) @@ -336,50 +352,74 @@ return Ok(vec![]); } - let mut scored_ids: Vec<(f32, u64)> = Vec::new(); + let mut candidates: Vec<SearchCandidate> = Vec::new(); for (score, doc_address) in top_docs { let doc: TantivyDocument = searcher .doc(doc_address) .map_err(|e| Status::internal(format!("Failed to retrieve document: {}", e)))?; - if let Some(pg_id_value) = doc.get_first(pg_id_field) { - if let Some(pg_id) = pg_id_value.as_u64() { - scored_ids.push((score, pg_id)); - } - } + let Some(pg_id_value) = doc.get_first(pg_id_field) else { + continue; + }; + let Some(table_name_value) = 
doc.get_first(table_name_field) else { + continue; + }; + let Some(pg_id) = pg_id_value.as_u64() else { + continue; + }; + let Some(table_name) = table_name_value.as_str() else { + continue; + }; + candidates.push(SearchCandidate { + score, + pg_id: pg_id as i64, + table_name: table_name.to_string(), + }); } - if scored_ids.is_empty() { + if candidates.is_empty() { return Ok(vec![]); } - let pg_ids: Vec<i64> = scored_ids.iter().map(|(_, id)| *id as i64).collect(); - let sql = format!( - "SELECT id, to_jsonb(t) AS data FROM {} t WHERE deleted = FALSE AND id = ANY($1)", - target.qualified_table - ); - - let rows = sqlx::query(&sql) - .bind(&pg_ids) - .fetch_all(pool) - .await - .map_err(|e| Status::internal(format!("Database query failed: {}", e)))?; - - let mut content_map: HashMap<i64, String> = HashMap::new(); - for row in rows { - let id: i64 = row.try_get("id").unwrap_or_default(); - let json_data: serde_json::Value = row.try_get("data").unwrap_or_default(); - content_map.insert(id, json_data.to_string()); + let mut ids_by_table: HashMap<String, Vec<i64>> = HashMap::new(); + for candidate in &candidates { + ids_by_table + .entry(candidate.table_name.clone()) + .or_default() + .push(candidate.pg_id); } + let mut content_map: HashMap<(String, i64), String> = HashMap::new(); + for (table_name, pg_ids) in ids_by_table { + validate_identifier(&table_name, "table_name")?; + let sql = format!( + "SELECT id, to_jsonb(t) AS data FROM {} t WHERE deleted = FALSE AND id = ANY($1)", + qualify_profile_table(&scope.profile_name, &table_name) + ); + + let rows = sqlx::query(&sql) + .bind(&pg_ids) + .fetch_all(pool) + .await + .map_err(|e| Status::internal(format!("Database query failed: {}", e)))?; + + for row in rows { + let id: i64 = row.try_get("id").unwrap_or_default(); + let json_data: serde_json::Value = row.try_get("data").unwrap_or_default(); + content_map.insert((table_name.clone(), id), json_data.to_string()); + } + } + + Ok(candidates .into_iter() - .filter_map(|(score, pg_id)| { 
content_map.get(&(pg_id as i64)).map(|content_json| Hit { - id: pg_id as i64, - score, - content_json: content_json.clone(), - table_name: target.table_name.clone(), - }) + .filter_map(|candidate| { + content_map + .get(&(candidate.table_name.clone(), candidate.pg_id)) + .map(|content_json| Hit { + id: candidate.pg_id, + score: candidate.score, + content_json: content_json.clone(), + table_name: candidate.table_name, + }) }) .collect()) } @@ -414,45 +454,37 @@ impl SearcherService { } // Request scope - let requested_table = req.table_name.as_deref().map(str::trim); - let targets = resolve_search_targets(&self.pool, profile_name, requested_table).await?; - - if targets.is_empty() { - return Ok(Response::new(SearchResponse { hits: vec![] })); - } + let scope = + resolve_search_scope(&self.pool, profile_name, req.table_name.as_deref().map(str::trim)) + .await?; let query = req.query.trim(); if query.is_empty() { // Empty query - if targets.len() != 1 { + let Some(table_name) = scope.requested_table.as_deref() else { return Err(Status::invalid_argument( "table_name is required when query is empty", )); - } + }; - let hits = fetch_default_hits(&self.pool, &targets[0]).await?; + let hits = fetch_default_hits(&self.pool, &scope.profile_name, table_name).await?; info!( "Empty query for profile '{}' table '{}'. 
Returning {} default hits.", - profile_name, - targets[0].table_name, + scope.profile_name, + table_name, hits.len() ); return Ok(Response::new(SearchResponse { hits })); } - if requested_table.is_some() && targets.len() == 1 && !targets[0].index_path.exists() { + if scope.requested_table.is_some() && !scope.index_path.exists() { return Err(Status::not_found(format!( - "No search index found for table '{}'", - targets[0].table_name + "No search index found for profile '{}'", + scope.profile_name ))); } - // Merge per-table hits - let mut hits = Vec::new(); - for target in &targets { - hits.extend(search_target(&self.pool, target, query, mode).await?); - } - + let mut hits = search_profile(&self.pool, &scope, query, mode).await?; hits.sort_by(|left, right| right.score.total_cmp(&left.score)); if hits.len() > SEARCH_RESULT_LIMIT { hits.truncate(SEARCH_RESULT_LIMIT); @@ -464,8 +496,8 @@ impl SearcherService { SearchMode::Fuzzy => "fuzzy", SearchMode::Exact => "exact", }, - profile_name, - requested_table.unwrap_or("*"), + scope.profile_name, + scope.requested_table.as_deref().unwrap_or("*"), hits.len() );