Refactor search to be profile-scoped, with an optional table filter

2026-04-29 00:38:42 +02:00
parent 1867de513d
commit 5de1cd7623
8 changed files with 365 additions and 207 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -493,7 +493,7 @@ checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"

 [[package]]
 name = "canvas"
-version = "0.6.2"
+version = "0.6.3"
 dependencies = [
 "anyhow",
 "async-trait",
@@ -585,7 +585,7 @@ dependencies = [

 [[package]]
 name = "client"
-version = "0.6.2"
+version = "0.6.3"
 dependencies = [
 "anyhow",
 "async-trait",
@@ -641,7 +641,7 @@ dependencies = [

 [[package]]
 name = "common"
-version = "0.6.2"
+version = "0.6.3"
 dependencies = [
 "prost 0.13.5",
 "prost-build 0.14.1",
@@ -3116,7 +3116,7 @@ checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b"

 [[package]]
 name = "search"
-version = "0.6.2"
+version = "0.6.3"
 dependencies = [
 "anyhow",
 "common",
@@ -3215,7 +3215,7 @@ dependencies = [

 [[package]]
 name = "server"
-version = "0.6.2"
+version = "0.6.3"
 dependencies = [
 "anyhow",
 "bcrypt",
--- a/common/proto/search.proto
+++ b/common/proto/search.proto
@@ -7,14 +7,16 @@ service Searcher {
 }

 message SearchRequest {
-  string table_name = 1;
+  optional string table_name = 1;
  string query = 2;
+  string profile_name = 3;
 }
 message SearchResponse {
  message Hit {
    int64 id = 1; // PostgreSQL row ID
    float score = 2;
    string content_json = 3;
+    string table_name = 4;
  }
  repeated Hit hits = 1;
 }
--- a/common/src/proto/descriptor.bin
+++ b/common/src/proto/descriptor.bin
--- a/common/src/proto/komp_ac.search.rs
+++ b/common/src/proto/komp_ac.search.rs
@@ -1,10 +1,12 @@
 // This file is @generated by prost-build.
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct SearchRequest {
-    #[prost(string, tag = "1")]
-    pub table_name: ::prost::alloc::string::String,
+    #[prost(string, optional, tag = "1")]
+    pub table_name: ::core::option::Option<::prost::alloc::string::String>,
    #[prost(string, tag = "2")]
    pub query: ::prost::alloc::string::String,
+    #[prost(string, tag = "3")]
+    pub profile_name: ::prost::alloc::string::String,
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct SearchResponse {
@@ -22,6 +24,8 @@ pub mod search_response {
        pub score: f32,
        #[prost(string, tag = "3")]
        pub content_json: ::prost::alloc::string::String,
+        #[prost(string, tag = "4")]
+        pub table_name: ::prost::alloc::string::String,
    }
 }
 /// Generated client implementations.
--- a/common/src/search.rs
+++ b/common/src/search.rs
@@ -1,16 +1,22 @@
 // common/src/search.rs

+use std::path::{Path, PathBuf};
 use tantivy::schema::*;
 use tantivy::tokenizer::*;
 use tantivy::Index;

+/// Returns the on-disk path for a profile/table search index.
+pub fn search_index_path(root: &Path, profile_name: &str, table_name: &str) -> PathBuf {
+    root.join(profile_name).join(table_name)
+}
+
 /// Creates a hybrid Slovak search schema with optimized prefix fields.
 pub fn create_search_schema() -> Schema {
    let mut schema_builder = Schema::builder();

    schema_builder.add_u64_field("pg_id", INDEXED | STORED);

-    // FIELD 1: For prefixes (1-4 chars).
+    // For prefixes (1-4 chars).
    let short_prefix_indexing = TextFieldIndexing::default()
        .set_tokenizer("slovak_prefix_edge")
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
@@ -19,7 +25,7 @@ pub fn create_search_schema() -> Schema {
        .set_stored();
    schema_builder.add_text_field("prefix_edge", short_prefix_options);

-    // FIELD 2: For the full word.
+    // For the full word.
    let full_word_indexing = TextFieldIndexing::default()
        .set_tokenizer("slovak_prefix_full")
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
--- a/search/.gitignore
+++ b/search/.gitignore
@@ -0,0 +1 @@
+.codex
--- a/search/src/lib.rs
+++ b/search/src/lib.rs
@@ -1,7 +1,6 @@
-// src/lib.rs
-
 use std::collections::HashMap;
-use std::path::Path;
+use std::path::{Path, PathBuf};
+
 use tantivy::collector::TopDocs;
 use tantivy::query::{
    BooleanQuery, BoostQuery, FuzzyTermQuery, Occur, Query, QueryParser, TermQuery,
@@ -11,20 +10,27 @@ use tantivy::{Index, TantivyDocument, Term};
 use tonic::{Request, Response, Status};

 use common::proto::komp_ac::search::searcher_server::Searcher;
-pub use common::proto::komp_ac::search::searcher_server::SearcherServer;
 use common::proto::komp_ac::search::{search_response::Hit, SearchRequest, SearchResponse};
-use common::search::register_slovak_tokenizers;
+pub use common::proto::komp_ac::search::searcher_server::SearcherServer;
+use common::search::{register_slovak_tokenizers, search_index_path};
 use sqlx::{PgPool, Row};
 use tracing::info;

-// We need to hold the database pool in our service struct.
+const INDEX_ROOT: &str = "./tantivy_indexes";
+const DEFAULT_RESULT_LIMIT: usize = 5;
+const SEARCH_RESULT_LIMIT: usize = 100;
+
 pub struct SearcherService {
    pub pool: PgPool,
 }

-// normalize_slovak_text function remains unchanged...
+struct SearchTarget {
+    table_name: String,
+    qualified_table: String,
+    index_path: PathBuf,
+}
+
 fn normalize_slovak_text(text: &str) -> String {
-    // ... function content is unchanged ...
    text.chars()
        .map(|c| match c {
            'á' | 'à' | 'â' | 'ä' | 'ă' | 'ā' => 'a',
@@ -60,94 +66,125 @@ fn normalize_slovak_text(text: &str) -> String {
        .collect()
 }

-#[tonic::async_trait]
-impl Searcher for SearcherService {
-    async fn search_table(
-        &self,
-        request: Request<SearchRequest>,
-    ) -> Result<Response<SearchResponse>, Status> {
-        let req = request.into_inner();
-        let table_name = req.table_name;
-        let query_str = req.query;
+fn validate_identifier(value: &str, field_name: &str) -> Result<(), Status> {
+    let mut chars = value.chars();
+    let Some(first) = chars.next() else {
+        return Err(Status::invalid_argument(format!(
+            "{field_name} must not be empty"
+        )));
+    };

-        // --- MODIFIED LOGIC ---
-        // If the query is empty, fetch the 5 most recent records.
-        if query_str.trim().is_empty() {
-            info!(
-                "Empty query for table '{}'. Fetching default results.",
-                table_name
-            );
-            let qualified_table = format!("gen.\"{}\"", table_name);
-            let sql = format!(
-                "SELECT id, to_jsonb(t) AS data FROM {} t ORDER BY id DESC LIMIT 5",
-                qualified_table
-            );
-
-            let rows = sqlx::query(&sql).fetch_all(&self.pool).await.map_err(|e| {
-                Status::internal(format!("DB query for default results failed: {}", e))
-            })?;
-
-            let hits: Vec<Hit> = rows
-                .into_iter()
-                .map(|row| {
-                    let id: i64 = row.try_get("id").unwrap_or_default();
-                    let json_data: serde_json::Value = row.try_get("data").unwrap_or_default();
-                    Hit {
-                        id,
-                        // Score is 0.0 as this is not a relevance-ranked search
-                        score: 0.0,
-                        content_json: json_data.to_string(),
-                    }
-                })
-                .collect();
-
-            info!(
-                "--- SERVER: Successfully processed empty query. Returning {} default hits. ---",
-                hits.len()
-            );
-            return Ok(Response::new(SearchResponse { hits }));
-        }
-        // --- END OF MODIFIED LOGIC ---
-
-        let index_path = Path::new("./tantivy_indexes").join(&table_name);
-        if !index_path.exists() {
-            return Err(Status::not_found(format!(
-                "No search index found for table '{}'",
-                table_name
+    if !(first.is_ascii_alphabetic() || first == '_')
+        || !chars.all(|ch| ch.is_ascii_alphanumeric() || ch == '_')
+    {
+        return Err(Status::invalid_argument(format!(
+            "{field_name} contains invalid characters"
        )));
    }

-        let index = Index::open_in_dir(&index_path)
-            .map_err(|e| Status::internal(format!("Failed to open index: {}", e)))?;
+    Ok(())
+}

-        register_slovak_tokenizers(&index).map_err(|e| {
-            Status::internal(format!("Failed to register Slovak tokenizers: {}", e))
+fn qualify_profile_table(profile_name: &str, table_name: &str) -> String {
+    format!("\"{}\".\"{}\"", profile_name, table_name)
+}
+
+async fn profile_exists(pool: &PgPool, profile_name: &str) -> Result<bool, Status> {
+    let exists = sqlx::query_scalar::<_, bool>("SELECT EXISTS(SELECT 1 FROM schemas WHERE name = $1)")
+        .bind(profile_name)
+        .fetch_one(pool)
+        .await
+        .map_err(|e| Status::internal(format!("Profile lookup failed: {}", e)))?;
+    Ok(exists)
+}
+
+// Scope resolution
+async fn resolve_search_targets(
+    pool: &PgPool,
+    profile_name: &str,
+    requested_table: Option<&str>,
+) -> Result<Vec<SearchTarget>, Status> {
+    validate_identifier(profile_name, "profile_name")?;
+
+    if !profile_exists(pool, profile_name).await? {
+        return Err(Status::not_found(format!(
+            "Profile '{}' was not found",
+            profile_name
+        )));
+    }
+
+    let tables = if let Some(table_name) = requested_table.filter(|value| !value.trim().is_empty()) {
+        validate_identifier(table_name, "table_name")?;
+
+        let row = sqlx::query_scalar::<_, String>(
+            r#"
+            SELECT td.table_name
+            FROM table_definitions td
+            JOIN schemas s ON td.schema_id = s.id
+            WHERE s.name = $1 AND td.table_name = $2
+            "#,
+        )
+        .bind(profile_name)
+        .bind(table_name)
+        .fetch_optional(pool)
+        .await
+        .map_err(|e| Status::internal(format!("Table lookup failed: {}", e)))?;
+
+        let table_name = row.ok_or_else(|| {
+            Status::not_found(format!(
+                "Table '{}' was not found in profile '{}'",
+                table_name, profile_name
+            ))
        })?;

-        let reader = index
-            .reader()
-            .map_err(|e| Status::internal(format!("Failed to create index reader: {}", e)))?;
-        let searcher = reader.searcher();
+        vec![table_name]
+    } else {
+        sqlx::query_scalar::<_, String>(
+            r#"
+            SELECT td.table_name
+            FROM table_definitions td
+            JOIN schemas s ON td.schema_id = s.id
+            WHERE s.name = $1
+            ORDER BY td.table_name
+            "#,
+        )
+        .bind(profile_name)
+        .fetch_all(pool)
+        .await
+        .map_err(|e| Status::internal(format!("Profile table lookup failed: {}", e)))?
+    };
+
+    Ok(tables
+        .into_iter()
+        .map(|table_name| SearchTarget {
+            qualified_table: qualify_profile_table(profile_name, &table_name),
+            index_path: search_index_path(Path::new(INDEX_ROOT), profile_name, &table_name),
+            table_name,
+        })
+        .collect())
+}
+
+// Query building
+fn build_query(index: &Index, normalized_query: &str) -> Result<Option<BooleanQuery>, Status> {
    let schema = index.schema();
+    let prefix_edge_field = schema
+        .get_field("prefix_edge")
+        .map_err(|_| Status::internal("Schema is missing the 'prefix_edge' field."))?;
+    let prefix_full_field = schema
+        .get_field("prefix_full")
+        .map_err(|_| Status::internal("Schema is missing the 'prefix_full' field."))?;
+    let text_ngram_field = schema
+        .get_field("text_ngram")
+        .map_err(|_| Status::internal("Schema is missing the 'text_ngram' field."))?;

-        let pg_id_field = schema
-            .get_field("pg_id")
-            .map_err(|_| Status::internal("Schema is missing the 'pg_id' field."))?;
-
-        // --- Query Building Logic (no changes here) ---
-        let prefix_edge_field = schema.get_field("prefix_edge").unwrap();
-        let prefix_full_field = schema.get_field("prefix_full").unwrap();
-        let text_ngram_field = schema.get_field("text_ngram").unwrap();
-        let normalized_query = normalize_slovak_text(&query_str);
    let words: Vec<&str> = normalized_query.split_whitespace().collect();
    if words.is_empty() {
-            return Ok(Response::new(SearchResponse { hits: vec![] }));
+        return Ok(None);
    }
+
    let mut query_layers: Vec<(Occur, Box<dyn Query>)> = Vec::new();
-        // ... all your query building layers remain exactly the same ...
-        // ===============================
-        // LAYER 1: PREFIX MATCHING (HIGHEST PRIORITY, Boost: 4.0)
-        // ===============================
+
+    // Layer 1: prefix
    {
        let mut must_clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new();
        for word in &words {
@@ -164,7 +201,7 @@ impl Searcher for SearcherService {
                    Box::new(TermQuery::new(full_term, IndexRecordOption::Basic)),
                ),
            ]);
-                must_clauses.push((Occur::Must, Box::new(per_word_query) as Box<dyn Query>));
+            must_clauses.push((Occur::Must, Box::new(per_word_query)));
        }

        if !must_clauses.is_empty() {
@@ -174,22 +211,20 @@ impl Searcher for SearcherService {
        }
    }

-        // ===============================
-        // LAYER 2: FUZZY MATCHING (HIGH PRIORITY, Boost: 3.0)
-        // ===============================
+    // Layer 2: fuzzy
    {
-            let last_word = words.last().unwrap();
+        let last_word = words
+            .last()
+            .ok_or_else(|| Status::internal("Query normalization lost all tokens"))?;
        let fuzzy_term = Term::from_field_text(prefix_full_field, last_word);
        let fuzzy_query = FuzzyTermQuery::new(fuzzy_term, 2, true);
        let boosted_query = BoostQuery::new(Box::new(fuzzy_query), 3.0);
        query_layers.push((Occur::Should, Box::new(boosted_query)));
    }

-        // ===============================
-        // LAYER 3: PHRASE MATCHING WITH SLOP (MEDIUM PRIORITY, Boost: 2.0)
-        // ===============================
+    // Layer 3: phrase
    if words.len() > 1 {
-            let slop_parser = QueryParser::for_index(&index, vec![prefix_full_field]);
+        let slop_parser = QueryParser::for_index(index, vec![prefix_full_field]);
        let slop_query_str = format!("\"{}\"~3", normalized_query);
        if let Ok(slop_query) = slop_parser.parse_query(&slop_query_str) {
            let boosted_query = BoostQuery::new(slop_query, 2.0);
@@ -197,30 +232,81 @@ impl Searcher for SearcherService {
        }
    }

-        // ===============================
-        // LAYER 4: NGRAM SUBSTRING MATCHING (LOWEST PRIORITY, Boost: 1.0)
-        // ===============================
+    // Layer 4: ngram
    {
-            let ngram_parser = QueryParser::for_index(&index, vec![text_ngram_field]);
-            if let Ok(ngram_query) = ngram_parser.parse_query(&normalized_query) {
+        let ngram_parser = QueryParser::for_index(index, vec![text_ngram_field]);
+        if let Ok(ngram_query) = ngram_parser.parse_query(normalized_query) {
            let boosted_query = BoostQuery::new(ngram_query, 1.0);
            query_layers.push((Occur::Should, Box::new(boosted_query)));
        }
    }
-        let master_query = BooleanQuery::new(query_layers);
-        // --- End of Query Building Logic ---
+
+    Ok(Some(BooleanQuery::new(query_layers)))
+}
+
+// Empty query
+async fn fetch_default_hits(pool: &PgPool, target: &SearchTarget) -> Result<Vec<Hit>, Status> {
+    let sql = format!(
+        "SELECT id, to_jsonb(t) AS data FROM {} t WHERE deleted = FALSE ORDER BY id DESC LIMIT {}",
+        target.qualified_table, DEFAULT_RESULT_LIMIT
+    );
+
+    let rows = sqlx::query(&sql)
+        .fetch_all(pool)
+        .await
+        .map_err(|e| Status::internal(format!("DB query for default results failed: {}", e)))?;
+
+    Ok(rows
+        .into_iter()
+        .map(|row| {
+            let id: i64 = row.try_get("id").unwrap_or_default();
+            let json_data: serde_json::Value = row.try_get("data").unwrap_or_default();
+            Hit {
+                id,
+                score: 0.0,
+                content_json: json_data.to_string(),
+                table_name: target.table_name.clone(),
+            }
+        })
+        .collect())
+}
+
+// Search + hydrate
+async fn search_target(
+    pool: &PgPool,
+    target: &SearchTarget,
+    query_str: &str,
+) -> Result<Vec<Hit>, Status> {
+    if !target.index_path.exists() {
+        return Ok(vec![]);
+    }
+
+    let index = Index::open_in_dir(&target.index_path)
+        .map_err(|e| Status::internal(format!("Failed to open index: {}", e)))?;
+    register_slovak_tokenizers(&index)
+        .map_err(|e| Status::internal(format!("Failed to register Slovak tokenizers: {}", e)))?;
+
+    let Some(master_query) = build_query(&index, &normalize_slovak_text(query_str))? else {
+        return Ok(vec![]);
+    };
+
+    let reader = index
+        .reader()
+        .map_err(|e| Status::internal(format!("Failed to create index reader: {}", e)))?;
+    let searcher = reader.searcher();
+    let schema = index.schema();
+    let pg_id_field = schema
+        .get_field("pg_id")
+        .map_err(|_| Status::internal("Schema is missing the 'pg_id' field."))?;

    let top_docs = searcher
-            .search(&master_query, &TopDocs::with_limit(100))
+        .search(&master_query, &TopDocs::with_limit(SEARCH_RESULT_LIMIT))
        .map_err(|e| Status::internal(format!("Search failed: {}", e)))?;

    if top_docs.is_empty() {
-            return Ok(Response::new(SearchResponse { hits: vec![] }));
+        return Ok(vec![]);
    }

-        // --- NEW LOGIC: Fetch from DB and combine results ---
-
-        // Step 1: Extract (score, pg_id) from Tantivy results.
    let mut scored_ids: Vec<(f32, u64)> = Vec::new();
    for (score, doc_address) in top_docs {
        let doc: TantivyDocument = searcher
@@ -233,47 +319,106 @@ impl Searcher for SearcherService {
        }
    }

-        // Step 2: Fetch all corresponding rows from Postgres in a single query.
+    if scored_ids.is_empty() {
+        return Ok(vec![]);
+    }
+
    let pg_ids: Vec<i64> = scored_ids.iter().map(|(_, id)| *id as i64).collect();
-        let qualified_table = format!("gen.\"{}\"", table_name);
-        let query_str = format!(
-            "SELECT id, to_jsonb(t) AS data FROM {} t WHERE id = ANY($1)",
-            qualified_table
+    let sql = format!(
+        "SELECT id, to_jsonb(t) AS data FROM {} t WHERE deleted = FALSE AND id = ANY($1)",
+        target.qualified_table
    );

-        let rows = sqlx::query(&query_str)
+    let rows = sqlx::query(&sql)
        .bind(&pg_ids)
-            .fetch_all(&self.pool)
+        .fetch_all(pool)
        .await
        .map_err(|e| Status::internal(format!("Database query failed: {}", e)))?;

-        // Step 3: Map the database results by ID for quick lookup.
    let mut content_map: HashMap<i64, String> = HashMap::new();
    for row in rows {
-            let id: i64 = row.try_get("id").unwrap_or(0);
-            let json_data: serde_json::Value =
-                row.try_get("data").unwrap_or(serde_json::Value::Null);
+        let id: i64 = row.try_get("id").unwrap_or_default();
+        let json_data: serde_json::Value = row.try_get("data").unwrap_or_default();
        content_map.insert(id, json_data.to_string());
    }

-        // Step 4: Build the final response, combining Tantivy scores with PG content.
-        let hits: Vec<Hit> = scored_ids
+    Ok(scored_ids
        .into_iter()
        .filter_map(|(score, pg_id)| {
            content_map.get(&(pg_id as i64)).map(|content_json| Hit {
                id: pg_id as i64,
                score,
                content_json: content_json.clone(),
+                table_name: target.table_name.clone(),
            })
        })
-            .collect();
+        .collect())
+}
+
+#[tonic::async_trait]
+impl Searcher for SearcherService {
+    async fn search_table(
+        &self,
+        request: Request<SearchRequest>,
+    ) -> Result<Response<SearchResponse>, Status> {
+        let req = request.into_inner();
+        let profile_name = req.profile_name.trim();
+        if profile_name.is_empty() {
+            return Err(Status::invalid_argument("profile_name is required"));
+        }
+
+        // Request scope
+        let requested_table = req.table_name.as_deref().map(str::trim);
+        let targets = resolve_search_targets(&self.pool, profile_name, requested_table).await?;
+
+        if targets.is_empty() {
+            return Ok(Response::new(SearchResponse { hits: vec![] }));
+        }
+
+        let query = req.query.trim();
+        if query.is_empty() {
+            // Empty query
+            if targets.len() != 1 {
+                return Err(Status::invalid_argument(
+                    "table_name is required when query is empty",
+                ));
+            }
+
+            let hits = fetch_default_hits(&self.pool, &targets[0]).await?;
+            info!(
+                "Empty query for profile '{}' table '{}'. Returning {} default hits.",
+                profile_name,
+                targets[0].table_name,
+                hits.len()
+            );
+            return Ok(Response::new(SearchResponse { hits }));
+        }
+
+        if requested_table.is_some() && targets.len() == 1 && !targets[0].index_path.exists() {
+            return Err(Status::not_found(format!(
+                "No search index found for table '{}'",
+                targets[0].table_name
+            )));
+        }
+
+        // Merge per-table hits
+        let mut hits = Vec::new();
+        for target in &targets {
+            hits.extend(search_target(&self.pool, target, query).await?);
+        }
+
+        hits.sort_by(|left, right| right.score.total_cmp(&left.score));
+        if hits.len() > SEARCH_RESULT_LIMIT {
+            hits.truncate(SEARCH_RESULT_LIMIT);
+        }

        info!(
-            "--- SERVER: Successfully processed search. Returning {} hits. ---",
+            "Processed search for profile '{}' (table scope: {}). Returning {} hits.",
+            profile_name,
+            requested_table.unwrap_or("*"),
            hits.len()
        );

-        let response = SearchResponse { hits };
-        Ok(Response::new(response))
+        Ok(Response::new(SearchResponse { hits }))
    }
 }
--- a/2
+++ b/2