Search redesigned with multi-query (column-constraint) support

This commit is contained in:
Priec
2026-04-29 19:56:17 +02:00
parent fb4769301c
commit b928004c76
7 changed files with 726 additions and 537 deletions

View File

@@ -3,15 +3,27 @@ syntax = "proto3";
package komp_ac.search;
service Searcher {
rpc SearchTable(SearchRequest) returns (SearchResponse);
rpc ExactSearchTable(SearchRequest) returns (SearchResponse);
rpc Search(SearchRequest) returns (SearchResponse);
}
enum MatchMode {
MATCH_MODE_UNSPECIFIED = 0;
MATCH_MODE_FUZZY = 1;
MATCH_MODE_EXACT = 2;
}
message ColumnConstraint {
string column = 1;
string query = 2;
MatchMode mode = 3;
}
message SearchRequest {
optional string table_name = 1;
string query = 2;
string profile_name = 3;
optional string column_name = 4;
string profile_name = 1;
optional string table_name = 2;
string free_query = 3;
repeated ColumnConstraint must = 4;
optional uint32 limit = 5;
}
message SearchResponse {
message Hit {

Binary file not shown.

View File

@@ -1,14 +1,25 @@
// This file is @generated by prost-build.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct SearchRequest {
#[prost(string, optional, tag = "1")]
pub table_name: ::core::option::Option<::prost::alloc::string::String>,
pub struct ColumnConstraint {
#[prost(string, tag = "1")]
pub column: ::prost::alloc::string::String,
#[prost(string, tag = "2")]
pub query: ::prost::alloc::string::String,
#[prost(string, tag = "3")]
#[prost(enumeration = "MatchMode", tag = "3")]
pub mode: i32,
}
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct SearchRequest {
#[prost(string, tag = "1")]
pub profile_name: ::prost::alloc::string::String,
#[prost(string, optional, tag = "4")]
pub column_name: ::core::option::Option<::prost::alloc::string::String>,
#[prost(string, optional, tag = "2")]
pub table_name: ::core::option::Option<::prost::alloc::string::String>,
#[prost(string, tag = "3")]
pub free_query: ::prost::alloc::string::String,
#[prost(message, repeated, tag = "4")]
pub must: ::prost::alloc::vec::Vec<ColumnConstraint>,
#[prost(uint32, optional, tag = "5")]
pub limit: ::core::option::Option<u32>,
}
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct SearchResponse {
@@ -30,6 +41,35 @@ pub mod search_response {
pub table_name: ::prost::alloc::string::String,
}
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
#[repr(i32)]
pub enum MatchMode {
Unspecified = 0,
Fuzzy = 1,
Exact = 2,
}
impl MatchMode {
/// String value of the enum field names used in the ProtoBuf definition.
///
/// The values are not transformed in any way and thus are considered stable
/// (if the ProtoBuf definition does not change) and safe for programmatic use.
pub fn as_str_name(&self) -> &'static str {
match self {
Self::Unspecified => "MATCH_MODE_UNSPECIFIED",
Self::Fuzzy => "MATCH_MODE_FUZZY",
Self::Exact => "MATCH_MODE_EXACT",
}
}
/// Creates an enum from field names used in the ProtoBuf definition.
pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
match value {
"MATCH_MODE_UNSPECIFIED" => Some(Self::Unspecified),
"MATCH_MODE_FUZZY" => Some(Self::Fuzzy),
"MATCH_MODE_EXACT" => Some(Self::Exact),
_ => None,
}
}
}
/// Generated client implementations.
pub mod searcher_client {
#![allow(
@@ -121,7 +161,7 @@ pub mod searcher_client {
self.inner = self.inner.max_encoding_message_size(limit);
self
}
pub async fn search_table(
pub async fn search(
&mut self,
request: impl tonic::IntoRequest<super::SearchRequest>,
) -> std::result::Result<tonic::Response<super::SearchResponse>, tonic::Status> {
@@ -135,32 +175,11 @@ pub mod searcher_client {
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/komp_ac.search.Searcher/SearchTable",
"/komp_ac.search.Searcher/Search",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(GrpcMethod::new("komp_ac.search.Searcher", "SearchTable"));
self.inner.unary(req, path, codec).await
}
pub async fn exact_search_table(
&mut self,
request: impl tonic::IntoRequest<super::SearchRequest>,
) -> std::result::Result<tonic::Response<super::SearchResponse>, tonic::Status> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::unknown(
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/komp_ac.search.Searcher/ExactSearchTable",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(GrpcMethod::new("komp_ac.search.Searcher", "ExactSearchTable"));
.insert(GrpcMethod::new("komp_ac.search.Searcher", "Search"));
self.inner.unary(req, path, codec).await
}
}
@@ -178,11 +197,7 @@ pub mod searcher_server {
/// Generated trait containing gRPC methods that should be implemented for use with SearcherServer.
#[async_trait]
pub trait Searcher: std::marker::Send + std::marker::Sync + 'static {
async fn search_table(
&self,
request: tonic::Request<super::SearchRequest>,
) -> std::result::Result<tonic::Response<super::SearchResponse>, tonic::Status>;
async fn exact_search_table(
async fn search(
&self,
request: tonic::Request<super::SearchRequest>,
) -> std::result::Result<tonic::Response<super::SearchResponse>, tonic::Status>;
@@ -263,11 +278,11 @@ pub mod searcher_server {
}
fn call(&mut self, req: http::Request<B>) -> Self::Future {
match req.uri().path() {
"/komp_ac.search.Searcher/SearchTable" => {
"/komp_ac.search.Searcher/Search" => {
#[allow(non_camel_case_types)]
struct SearchTableSvc<T: Searcher>(pub Arc<T>);
struct SearchSvc<T: Searcher>(pub Arc<T>);
impl<T: Searcher> tonic::server::UnaryService<super::SearchRequest>
for SearchTableSvc<T> {
for SearchSvc<T> {
type Response = super::SearchResponse;
type Future = BoxFuture<
tonic::Response<Self::Response>,
@@ -279,7 +294,7 @@ pub mod searcher_server {
) -> Self::Future {
let inner = Arc::clone(&self.0);
let fut = async move {
<T as Searcher>::search_table(&inner, request).await
<T as Searcher>::search(&inner, request).await
};
Box::pin(fut)
}
@@ -290,50 +305,7 @@ pub mod searcher_server {
let max_encoding_message_size = self.max_encoding_message_size;
let inner = self.inner.clone();
let fut = async move {
let method = SearchTableSvc(inner);
let codec = tonic::codec::ProstCodec::default();
let mut grpc = tonic::server::Grpc::new(codec)
.apply_compression_config(
accept_compression_encodings,
send_compression_encodings,
)
.apply_max_message_size_config(
max_decoding_message_size,
max_encoding_message_size,
);
let res = grpc.unary(method, req).await;
Ok(res)
};
Box::pin(fut)
}
"/komp_ac.search.Searcher/ExactSearchTable" => {
#[allow(non_camel_case_types)]
struct ExactSearchTableSvc<T: Searcher>(pub Arc<T>);
impl<T: Searcher> tonic::server::UnaryService<super::SearchRequest>
for ExactSearchTableSvc<T> {
type Response = super::SearchResponse;
type Future = BoxFuture<
tonic::Response<Self::Response>,
tonic::Status,
>;
fn call(
&mut self,
request: tonic::Request<super::SearchRequest>,
) -> Self::Future {
let inner = Arc::clone(&self.0);
let fut = async move {
<T as Searcher>::exact_search_table(&inner, request).await
};
Box::pin(fut)
}
}
let accept_compression_encodings = self.accept_compression_encodings;
let send_compression_encodings = self.send_compression_encodings;
let max_decoding_message_size = self.max_decoding_message_size;
let max_encoding_message_size = self.max_encoding_message_size;
let inner = self.inner.clone();
let fut = async move {
let method = ExactSearchTableSvc(inner);
let method = SearchSvc(inner);
let codec = tonic::codec::ProstCodec::default();
let mut grpc = tonic::server::Grpc::new(codec)
.apply_compression_config(

View File

@@ -1,10 +1,26 @@
// common/src/search.rs
use std::path::{Path, PathBuf};
use tantivy::schema::*;
use tantivy::tokenizer::*;
use tantivy::schema::{
Field, IndexRecordOption, JsonObjectOptions, Schema, TextFieldIndexing, Term, INDEXED,
STORED, STRING,
};
use tantivy::tokenizer::{
AsciiFoldingFilter, LowerCaser, NgramTokenizer, RawTokenizer, RemoveLongFilter,
SimpleTokenizer, TextAnalyzer, TokenStream,
};
use tantivy::Index;
pub const F_PG_ID: &str = "pg_id";
pub const F_TABLE_NAME: &str = "table_name";
pub const F_ROW_KEY: &str = "row_key";
pub const F_DATA_WORD: &str = "data_word";
pub const F_DATA_NGRAM: &str = "data_ngram";
pub const F_DATA_EXACT: &str = "data_exact";
pub const TOK_WORD: &str = "kw_word";
pub const TOK_NGRAM: &str = "kw_ngram";
pub const TOK_EXACT: &str = "kw_exact";
/// Returns the on-disk path for a profile search index.
pub fn search_index_path(root: &Path, profile_name: &str) -> PathBuf {
root.join(profile_name)
@@ -15,117 +31,152 @@ pub fn search_row_key(table_name: &str, row_id: i64) -> String {
format!("{}:{}", table_name, row_id)
}
/// Normalizes user-entered search text while preserving letter case.
pub fn normalize_search_text(text: &str) -> String {
text.chars()
.map(|ch| match ch {
'á' | 'à' | 'â' | 'ä' | 'ă' | 'ā' => 'a',
'Á' | 'À' | 'Â' | 'Ä' | 'Ă' | 'Ā' => 'A',
'é' | 'è' | 'ê' | 'ë' | 'ě' | 'ē' => 'e',
'É' | 'È' | 'Ê' | 'Ë' | 'Ě' | 'Ē' => 'E',
'í' | 'ì' | 'î' | 'ï' | 'ī' => 'i',
'Í' | 'Ì' | 'Î' | 'Ï' | 'Ī' => 'I',
'ó' | 'ò' | 'ô' | 'ö' | 'ō' | 'ő' => 'o',
'Ó' | 'Ò' | 'Ô' | 'Ö' | 'Ō' | 'Ő' => 'O',
'ú' | 'ù' | 'û' | 'ü' | 'ū' | 'ű' => 'u',
'Ú' | 'Ù' | 'Û' | 'Ü' | 'Ū' | 'Ű' => 'U',
'ý' | 'ỳ' | 'ŷ' | 'ÿ' => 'y',
'Ý' | 'Ỳ' | 'Ŷ' | 'Ÿ' => 'Y',
'č' => 'c',
'Č' => 'C',
'ď' => 'd',
'Ď' => 'D',
'ľ' => 'l',
'Ľ' => 'L',
'ň' => 'n',
'Ň' => 'N',
'ř' => 'r',
'Ř' => 'R',
'š' => 's',
'Š' => 'S',
'ť' => 't',
'Ť' => 'T',
'ž' => 'z',
'Ž' => 'Z',
_ => ch,
})
.collect()
/// Normalizes user-entered values for exact-mode terms.
///
/// Trims surrounding whitespace, then runs the trimmed text through the same
/// analyzer used for the `data_exact` field, so query terms and indexed terms
/// share one canonical form.
pub fn normalize_exact(input: &str) -> String {
    let value = input.trim();
    if value.is_empty() {
        return String::new();
    }
    let mut normalized = String::with_capacity(value.len());
    let mut analyzer = exact_analyzer();
    let mut tokens = analyzer.token_stream(value);
    while let Some(tok) = tokens.next() {
        normalized.push_str(&tok.text);
    }
    normalized
}
/// Normalizes an exact-match value so indexed data and user input use the same form.
pub fn normalize_exact_value(text: &str) -> String {
normalize_search_text(text).to_lowercase()
/// Normalizes a column name to the JSON-key form used at index time.
///
/// Index-time keys are ASCII-lowercased, so lookups must apply the same
/// folding before building a JSON-path term. Non-ASCII characters pass
/// through unchanged.
pub fn normalize_column_name(column: &str) -> String {
    let mut key = String::with_capacity(column.len());
    key.extend(column.chars().map(|c| c.to_ascii_lowercase()));
    key
}
/// Creates a hybrid Slovak search schema with optimized prefix fields.
/// Creates the column-aware search schema.
pub fn create_search_schema() -> Schema {
let mut schema_builder = Schema::builder();
schema_builder.add_u64_field("pg_id", INDEXED | STORED);
schema_builder.add_text_field("table_name", STRING | STORED);
schema_builder.add_text_field("row_key", STRING | STORED);
schema_builder.add_text_field("column_exact", STRING);
schema_builder.add_u64_field(F_PG_ID, INDEXED | STORED);
schema_builder.add_text_field(F_TABLE_NAME, STRING | STORED);
schema_builder.add_text_field(F_ROW_KEY, STRING | STORED);
// For prefixes (1-4 chars).
let short_prefix_indexing = TextFieldIndexing::default()
.set_tokenizer("slovak_prefix_edge")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let short_prefix_options = TextOptions::default()
.set_indexing_options(short_prefix_indexing)
.set_stored();
schema_builder.add_text_field("prefix_edge", short_prefix_options);
// For the full word.
let full_word_indexing = TextFieldIndexing::default()
.set_tokenizer("slovak_prefix_full")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let full_word_options = TextOptions::default()
.set_indexing_options(full_word_indexing)
.set_stored();
schema_builder.add_text_field("prefix_full", full_word_options);
// NGRAM FIELD: For substring matching.
let ngram_field_indexing = TextFieldIndexing::default()
.set_tokenizer("slovak_ngram")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let ngram_options = TextOptions::default()
.set_indexing_options(ngram_field_indexing)
.set_stored();
schema_builder.add_text_field("text_ngram", ngram_options);
schema_builder.add_json_field(F_DATA_WORD, json_options(TOK_WORD, true, false));
schema_builder.add_json_field(F_DATA_NGRAM, json_options(TOK_NGRAM, true, false));
schema_builder.add_json_field(F_DATA_EXACT, json_options(TOK_EXACT, false, false));
schema_builder.build()
}
/// Registers all necessary Slovak tokenizers with the index.
///
/// This must be called by ANY process that opens the index
/// to ensure the tokenizers are loaded into memory.
pub fn register_slovak_tokenizers(index: &Index) -> tantivy::Result<()> {
/// Builds `JsonObjectOptions` for a JSON field indexed with the given tokenizer.
///
/// `with_positions` controls whether term positions are recorded (needed for
/// phrase-style matching); `stored` controls whether the raw JSON is kept in
/// the document store.
fn json_options(
    tokenizer_name: &str,
    with_positions: bool,
    stored: bool,
) -> JsonObjectOptions {
    let record = match with_positions {
        true => IndexRecordOption::WithFreqsAndPositions,
        false => IndexRecordOption::Basic,
    };
    let base = JsonObjectOptions::default().set_indexing_options(
        TextFieldIndexing::default()
            .set_tokenizer(tokenizer_name)
            .set_index_option(record),
    );
    if stored { base.set_stored() } else { base }
}
/// Registers all required tokenizers with the index.
pub fn register_tokenizers(index: &Index) -> tantivy::Result<()> {
let tokenizer_manager = index.tokenizers();
// TOKENIZER for `prefix_edge`: Edge N-gram (1-4 chars)
let edge_tokenizer = TextAnalyzer::builder(NgramTokenizer::new(1, 4, true)?)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(AsciiFoldingFilter)
.build();
tokenizer_manager.register("slovak_prefix_edge", edge_tokenizer);
// TOKENIZER for `prefix_full`: Simple word tokenizer
let full_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(AsciiFoldingFilter)
.build();
tokenizer_manager.register("slovak_prefix_full", full_tokenizer);
// NGRAM TOKENIZER: For substring matching.
let ngram_tokenizer = TextAnalyzer::builder(NgramTokenizer::new(3, 3, false)?)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(AsciiFoldingFilter)
.build();
tokenizer_manager.register("slovak_ngram", ngram_tokenizer);
tokenizer_manager.register(TOK_WORD, word_analyzer());
tokenizer_manager.register(TOK_NGRAM, ngram_analyzer()?);
tokenizer_manager.register(TOK_EXACT, exact_analyzer());
Ok(())
}
/// Analyzer for the `data_word` field: simple word tokens, length-capped,
/// lowercased, with diacritics folded to ASCII.
fn word_analyzer() -> TextAnalyzer {
    TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(RemoveLongFilter::limit(80)) // drop pathological >80-char tokens
        .filter(LowerCaser)
        .filter(AsciiFoldingFilter)
        .build()
}
/// Analyzer for the `data_ngram` field: fixed 3-grams over the whole token
/// (not prefix-only), lowercased and ASCII-folded.
///
/// Errs only if the ngram bounds passed to `NgramTokenizer::new` are invalid.
fn ngram_analyzer() -> tantivy::Result<TextAnalyzer> {
    Ok(TextAnalyzer::builder(NgramTokenizer::new(3, 3, false)?)
        .filter(RemoveLongFilter::limit(80))
        .filter(LowerCaser)
        .filter(AsciiFoldingFilter)
        .build())
}
/// Analyzer for the `data_exact` field: the entire input as a single raw
/// token (no splitting), lowercased and ASCII-folded.
fn exact_analyzer() -> TextAnalyzer {
    TextAnalyzer::builder(RawTokenizer::default())
        .filter(LowerCaser)
        .filter(AsciiFoldingFilter)
        .build()
}
/// Tokenizes text the same way `data_word` is indexed.
///
/// Use this to build query terms that line up exactly with indexed tokens.
pub fn tokenize_word(text: &str) -> Vec<String> {
    tokenize_with(word_analyzer(), text)
}
/// Tokenizes text the same way `data_ngram` is indexed.
///
/// Falls back to an empty token list if the ngram analyzer cannot be built.
pub fn tokenize_ngram(text: &str) -> Vec<String> {
    ngram_analyzer()
        .map(|analyzer| tokenize_with(analyzer, text))
        .unwrap_or_default()
}
/// Runs `text` through `analyzer` and collects the text of every token
/// the stream produces, in order.
fn tokenize_with(mut analyzer: TextAnalyzer, text: &str) -> Vec<String> {
    let mut stream = analyzer.token_stream(text);
    std::iter::from_fn(|| stream.next().map(|token| token.text.clone())).collect()
}
/// Builds a term scoped to a specific JSON path within a JSON field.
pub fn json_path_term(field: Field, column: &str, text: &str) -> Term {
let mut term = Term::from_field_json_path(field, column, false);
term.append_type_and_str(text);
term
}
/// Resolved handles for every field this crate expects in the search schema.
///
/// Construct via [`SchemaFields::from`], which fails loudly if any required
/// field is missing from the schema.
pub struct SchemaFields {
    pub pg_id: Field,
    pub table_name: Field,
    pub row_key: Field,
    pub data_word: Field,
    pub data_ngram: Field,
    pub data_exact: Field,
}
impl SchemaFields {
    /// Looks up all required fields in `schema`, returning a `SchemaError`
    /// naming the first field that is absent.
    pub fn from(schema: &Schema) -> tantivy::Result<Self> {
        Ok(Self {
            pg_id: get_field(schema, F_PG_ID)?,
            table_name: get_field(schema, F_TABLE_NAME)?,
            row_key: get_field(schema, F_ROW_KEY)?,
            data_word: get_field(schema, F_DATA_WORD)?,
            data_ngram: get_field(schema, F_DATA_NGRAM)?,
            data_exact: get_field(schema, F_DATA_EXACT)?,
        })
    }
}
/// Resolves `name` in `schema`, wrapping a miss in a descriptive
/// `SchemaError` so callers fail with the missing field's name.
fn get_field(schema: &Schema, name: &str) -> tantivy::Result<Field> {
    match schema.get_field(name) {
        Ok(field) => Ok(field),
        Err(e) => Err(tantivy::TantivyError::SchemaError(format!(
            "schema is missing field '{name}': {e}"
        ))),
    }
}