Search redesigned with multi-query (column-constraint) support

This commit is contained in:
Priec
2026-04-29 19:56:17 +02:00
parent fb4769301c
commit b928004c76
7 changed files with 726 additions and 537 deletions

View File

@@ -3,15 +3,27 @@ syntax = "proto3";
package komp_ac.search;
service Searcher {
rpc SearchTable(SearchRequest) returns (SearchResponse);
rpc ExactSearchTable(SearchRequest) returns (SearchResponse);
rpc Search(SearchRequest) returns (SearchResponse);
}
enum MatchMode {
MATCH_MODE_UNSPECIFIED = 0;
MATCH_MODE_FUZZY = 1;
MATCH_MODE_EXACT = 2;
}
message ColumnConstraint {
string column = 1;
string query = 2;
MatchMode mode = 3;
}
message SearchRequest {
optional string table_name = 1;
string query = 2;
string profile_name = 3;
optional string column_name = 4;
string profile_name = 1;
optional string table_name = 2;
string free_query = 3;
repeated ColumnConstraint must = 4;
optional uint32 limit = 5;
}
message SearchResponse {
message Hit {

Binary file not shown.

View File

@@ -1,14 +1,25 @@
// This file is @generated by prost-build.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct SearchRequest {
#[prost(string, optional, tag = "1")]
pub table_name: ::core::option::Option<::prost::alloc::string::String>,
pub struct ColumnConstraint {
#[prost(string, tag = "1")]
pub column: ::prost::alloc::string::String,
#[prost(string, tag = "2")]
pub query: ::prost::alloc::string::String,
#[prost(string, tag = "3")]
#[prost(enumeration = "MatchMode", tag = "3")]
pub mode: i32,
}
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct SearchRequest {
#[prost(string, tag = "1")]
pub profile_name: ::prost::alloc::string::String,
#[prost(string, optional, tag = "4")]
pub column_name: ::core::option::Option<::prost::alloc::string::String>,
#[prost(string, optional, tag = "2")]
pub table_name: ::core::option::Option<::prost::alloc::string::String>,
#[prost(string, tag = "3")]
pub free_query: ::prost::alloc::string::String,
#[prost(message, repeated, tag = "4")]
pub must: ::prost::alloc::vec::Vec<ColumnConstraint>,
#[prost(uint32, optional, tag = "5")]
pub limit: ::core::option::Option<u32>,
}
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct SearchResponse {
@@ -30,6 +41,35 @@ pub mod search_response {
pub table_name: ::prost::alloc::string::String,
}
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
#[repr(i32)]
pub enum MatchMode {
Unspecified = 0,
Fuzzy = 1,
Exact = 2,
}
impl MatchMode {
/// String value of the enum field names used in the ProtoBuf definition.
///
/// The values are not transformed in any way and thus are considered stable
/// (if the ProtoBuf definition does not change) and safe for programmatic use.
pub fn as_str_name(&self) -> &'static str {
match self {
Self::Unspecified => "MATCH_MODE_UNSPECIFIED",
Self::Fuzzy => "MATCH_MODE_FUZZY",
Self::Exact => "MATCH_MODE_EXACT",
}
}
/// Creates an enum from field names used in the ProtoBuf definition.
pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
match value {
"MATCH_MODE_UNSPECIFIED" => Some(Self::Unspecified),
"MATCH_MODE_FUZZY" => Some(Self::Fuzzy),
"MATCH_MODE_EXACT" => Some(Self::Exact),
_ => None,
}
}
}
/// Generated client implementations.
pub mod searcher_client {
#![allow(
@@ -121,7 +161,7 @@ pub mod searcher_client {
self.inner = self.inner.max_encoding_message_size(limit);
self
}
pub async fn search_table(
pub async fn search(
&mut self,
request: impl tonic::IntoRequest<super::SearchRequest>,
) -> std::result::Result<tonic::Response<super::SearchResponse>, tonic::Status> {
@@ -135,32 +175,11 @@ pub mod searcher_client {
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/komp_ac.search.Searcher/SearchTable",
"/komp_ac.search.Searcher/Search",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(GrpcMethod::new("komp_ac.search.Searcher", "SearchTable"));
self.inner.unary(req, path, codec).await
}
pub async fn exact_search_table(
&mut self,
request: impl tonic::IntoRequest<super::SearchRequest>,
) -> std::result::Result<tonic::Response<super::SearchResponse>, tonic::Status> {
self.inner
.ready()
.await
.map_err(|e| {
tonic::Status::unknown(
format!("Service was not ready: {}", e.into()),
)
})?;
let codec = tonic::codec::ProstCodec::default();
let path = http::uri::PathAndQuery::from_static(
"/komp_ac.search.Searcher/ExactSearchTable",
);
let mut req = request.into_request();
req.extensions_mut()
.insert(GrpcMethod::new("komp_ac.search.Searcher", "ExactSearchTable"));
.insert(GrpcMethod::new("komp_ac.search.Searcher", "Search"));
self.inner.unary(req, path, codec).await
}
}
@@ -178,11 +197,7 @@ pub mod searcher_server {
/// Generated trait containing gRPC methods that should be implemented for use with SearcherServer.
#[async_trait]
pub trait Searcher: std::marker::Send + std::marker::Sync + 'static {
async fn search_table(
&self,
request: tonic::Request<super::SearchRequest>,
) -> std::result::Result<tonic::Response<super::SearchResponse>, tonic::Status>;
async fn exact_search_table(
async fn search(
&self,
request: tonic::Request<super::SearchRequest>,
) -> std::result::Result<tonic::Response<super::SearchResponse>, tonic::Status>;
@@ -263,11 +278,11 @@ pub mod searcher_server {
}
fn call(&mut self, req: http::Request<B>) -> Self::Future {
match req.uri().path() {
"/komp_ac.search.Searcher/SearchTable" => {
"/komp_ac.search.Searcher/Search" => {
#[allow(non_camel_case_types)]
struct SearchTableSvc<T: Searcher>(pub Arc<T>);
struct SearchSvc<T: Searcher>(pub Arc<T>);
impl<T: Searcher> tonic::server::UnaryService<super::SearchRequest>
for SearchTableSvc<T> {
for SearchSvc<T> {
type Response = super::SearchResponse;
type Future = BoxFuture<
tonic::Response<Self::Response>,
@@ -279,7 +294,7 @@ pub mod searcher_server {
) -> Self::Future {
let inner = Arc::clone(&self.0);
let fut = async move {
<T as Searcher>::search_table(&inner, request).await
<T as Searcher>::search(&inner, request).await
};
Box::pin(fut)
}
@@ -290,50 +305,7 @@ pub mod searcher_server {
let max_encoding_message_size = self.max_encoding_message_size;
let inner = self.inner.clone();
let fut = async move {
let method = SearchTableSvc(inner);
let codec = tonic::codec::ProstCodec::default();
let mut grpc = tonic::server::Grpc::new(codec)
.apply_compression_config(
accept_compression_encodings,
send_compression_encodings,
)
.apply_max_message_size_config(
max_decoding_message_size,
max_encoding_message_size,
);
let res = grpc.unary(method, req).await;
Ok(res)
};
Box::pin(fut)
}
"/komp_ac.search.Searcher/ExactSearchTable" => {
#[allow(non_camel_case_types)]
struct ExactSearchTableSvc<T: Searcher>(pub Arc<T>);
impl<T: Searcher> tonic::server::UnaryService<super::SearchRequest>
for ExactSearchTableSvc<T> {
type Response = super::SearchResponse;
type Future = BoxFuture<
tonic::Response<Self::Response>,
tonic::Status,
>;
fn call(
&mut self,
request: tonic::Request<super::SearchRequest>,
) -> Self::Future {
let inner = Arc::clone(&self.0);
let fut = async move {
<T as Searcher>::exact_search_table(&inner, request).await
};
Box::pin(fut)
}
}
let accept_compression_encodings = self.accept_compression_encodings;
let send_compression_encodings = self.send_compression_encodings;
let max_decoding_message_size = self.max_decoding_message_size;
let max_encoding_message_size = self.max_encoding_message_size;
let inner = self.inner.clone();
let fut = async move {
let method = ExactSearchTableSvc(inner);
let method = SearchSvc(inner);
let codec = tonic::codec::ProstCodec::default();
let mut grpc = tonic::server::Grpc::new(codec)
.apply_compression_config(

View File

@@ -1,10 +1,26 @@
// common/src/search.rs
use std::path::{Path, PathBuf};
use tantivy::schema::*;
use tantivy::tokenizer::*;
use tantivy::schema::{
Field, IndexRecordOption, JsonObjectOptions, Schema, TextFieldIndexing, Term, INDEXED,
STORED, STRING,
};
use tantivy::tokenizer::{
AsciiFoldingFilter, LowerCaser, NgramTokenizer, RawTokenizer, RemoveLongFilter,
SimpleTokenizer, TextAnalyzer, TokenStream,
};
use tantivy::Index;
pub const F_PG_ID: &str = "pg_id";
pub const F_TABLE_NAME: &str = "table_name";
pub const F_ROW_KEY: &str = "row_key";
pub const F_DATA_WORD: &str = "data_word";
pub const F_DATA_NGRAM: &str = "data_ngram";
pub const F_DATA_EXACT: &str = "data_exact";
pub const TOK_WORD: &str = "kw_word";
pub const TOK_NGRAM: &str = "kw_ngram";
pub const TOK_EXACT: &str = "kw_exact";
/// Returns the on-disk path for a profile search index.
pub fn search_index_path(root: &Path, profile_name: &str) -> PathBuf {
root.join(profile_name)
@@ -15,117 +31,152 @@ pub fn search_row_key(table_name: &str, row_id: i64) -> String {
format!("{}:{}", table_name, row_id)
}
/// Normalizes user-entered search text while preserving letter case.
pub fn normalize_search_text(text: &str) -> String {
text.chars()
.map(|ch| match ch {
'á' | 'à' | 'â' | 'ä' | 'ă' | 'ā' => 'a',
'Á' | 'À' | 'Â' | 'Ä' | 'Ă' | 'Ā' => 'A',
'é' | 'è' | 'ê' | 'ë' | 'ě' | 'ē' => 'e',
'É' | 'È' | 'Ê' | 'Ë' | 'Ě' | 'Ē' => 'E',
'í' | 'ì' | 'î' | 'ï' | 'ī' => 'i',
'Í' | 'Ì' | 'Î' | 'Ï' | 'Ī' => 'I',
'ó' | 'ò' | 'ô' | 'ö' | 'ō' | 'ő' => 'o',
'Ó' | 'Ò' | 'Ô' | 'Ö' | 'Ō' | 'Ő' => 'O',
'ú' | 'ù' | 'û' | 'ü' | 'ū' | 'ű' => 'u',
'Ú' | 'Ù' | 'Û' | 'Ü' | 'Ū' | 'Ű' => 'U',
'ý' | 'ỳ' | 'ŷ' | 'ÿ' => 'y',
'Ý' | 'Ỳ' | 'Ŷ' | 'Ÿ' => 'Y',
'č' => 'c',
'Č' => 'C',
'ď' => 'd',
'Ď' => 'D',
'ľ' => 'l',
'Ľ' => 'L',
'ň' => 'n',
'Ň' => 'N',
'ř' => 'r',
'Ř' => 'R',
'š' => 's',
'Š' => 'S',
'ť' => 't',
'Ť' => 'T',
'ž' => 'z',
'Ž' => 'Z',
_ => ch,
})
.collect()
/// Normalizes user-entered values for exact-mode terms.
///
/// Trims surrounding whitespace, then runs the trimmed text through the same
/// analyzer used for the `data_exact` field, so query terms and indexed terms
/// share one canonical form.
pub fn normalize_exact(input: &str) -> String {
    let value = input.trim();
    if value.is_empty() {
        return String::new();
    }
    let mut normalized = String::with_capacity(value.len());
    let mut analyzer = exact_analyzer();
    let mut tokens = analyzer.token_stream(value);
    while let Some(tok) = tokens.next() {
        normalized.push_str(&tok.text);
    }
    normalized
}
/// Normalizes an exact-match value so indexed data and user input use the same form.
pub fn normalize_exact_value(text: &str) -> String {
normalize_search_text(text).to_lowercase()
/// Normalizes a column name to the JSON-key form used at index time.
///
/// Index-time keys are ASCII-lowercased, so lookups must apply the same
/// folding before building a JSON-path term. Non-ASCII characters pass
/// through unchanged.
pub fn normalize_column_name(column: &str) -> String {
    let mut key = String::with_capacity(column.len());
    key.extend(column.chars().map(|c| c.to_ascii_lowercase()));
    key
}
/// Creates a hybrid Slovak search schema with optimized prefix fields.
/// Creates the column-aware search schema.
pub fn create_search_schema() -> Schema {
let mut schema_builder = Schema::builder();
schema_builder.add_u64_field("pg_id", INDEXED | STORED);
schema_builder.add_text_field("table_name", STRING | STORED);
schema_builder.add_text_field("row_key", STRING | STORED);
schema_builder.add_text_field("column_exact", STRING);
schema_builder.add_u64_field(F_PG_ID, INDEXED | STORED);
schema_builder.add_text_field(F_TABLE_NAME, STRING | STORED);
schema_builder.add_text_field(F_ROW_KEY, STRING | STORED);
// For prefixes (1-4 chars).
let short_prefix_indexing = TextFieldIndexing::default()
.set_tokenizer("slovak_prefix_edge")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let short_prefix_options = TextOptions::default()
.set_indexing_options(short_prefix_indexing)
.set_stored();
schema_builder.add_text_field("prefix_edge", short_prefix_options);
// For the full word.
let full_word_indexing = TextFieldIndexing::default()
.set_tokenizer("slovak_prefix_full")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let full_word_options = TextOptions::default()
.set_indexing_options(full_word_indexing)
.set_stored();
schema_builder.add_text_field("prefix_full", full_word_options);
// NGRAM FIELD: For substring matching.
let ngram_field_indexing = TextFieldIndexing::default()
.set_tokenizer("slovak_ngram")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let ngram_options = TextOptions::default()
.set_indexing_options(ngram_field_indexing)
.set_stored();
schema_builder.add_text_field("text_ngram", ngram_options);
schema_builder.add_json_field(F_DATA_WORD, json_options(TOK_WORD, true, false));
schema_builder.add_json_field(F_DATA_NGRAM, json_options(TOK_NGRAM, true, false));
schema_builder.add_json_field(F_DATA_EXACT, json_options(TOK_EXACT, false, false));
schema_builder.build()
}
/// Registers all necessary Slovak tokenizers with the index.
///
/// This must be called by ANY process that opens the index
/// to ensure the tokenizers are loaded into memory.
pub fn register_slovak_tokenizers(index: &Index) -> tantivy::Result<()> {
/// Builds `JsonObjectOptions` for a JSON field indexed with the given tokenizer.
///
/// `with_positions` controls whether term positions are recorded (needed for
/// phrase-style matching); `stored` controls whether the raw JSON is kept in
/// the document store.
fn json_options(
    tokenizer_name: &str,
    with_positions: bool,
    stored: bool,
) -> JsonObjectOptions {
    let record = match with_positions {
        true => IndexRecordOption::WithFreqsAndPositions,
        false => IndexRecordOption::Basic,
    };
    let base = JsonObjectOptions::default().set_indexing_options(
        TextFieldIndexing::default()
            .set_tokenizer(tokenizer_name)
            .set_index_option(record),
    );
    if stored { base.set_stored() } else { base }
}
/// Registers all required tokenizers with the index.
pub fn register_tokenizers(index: &Index) -> tantivy::Result<()> {
let tokenizer_manager = index.tokenizers();
// TOKENIZER for `prefix_edge`: Edge N-gram (1-4 chars)
let edge_tokenizer = TextAnalyzer::builder(NgramTokenizer::new(1, 4, true)?)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(AsciiFoldingFilter)
.build();
tokenizer_manager.register("slovak_prefix_edge", edge_tokenizer);
// TOKENIZER for `prefix_full`: Simple word tokenizer
let full_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(AsciiFoldingFilter)
.build();
tokenizer_manager.register("slovak_prefix_full", full_tokenizer);
// NGRAM TOKENIZER: For substring matching.
let ngram_tokenizer = TextAnalyzer::builder(NgramTokenizer::new(3, 3, false)?)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(AsciiFoldingFilter)
.build();
tokenizer_manager.register("slovak_ngram", ngram_tokenizer);
tokenizer_manager.register(TOK_WORD, word_analyzer());
tokenizer_manager.register(TOK_NGRAM, ngram_analyzer()?);
tokenizer_manager.register(TOK_EXACT, exact_analyzer());
Ok(())
}
/// Analyzer for the `data_word` field: simple word tokens, length-capped,
/// lowercased, with diacritics folded to ASCII.
fn word_analyzer() -> TextAnalyzer {
    TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(RemoveLongFilter::limit(80)) // drop pathological >80-char tokens
        .filter(LowerCaser)
        .filter(AsciiFoldingFilter)
        .build()
}
/// Analyzer for the `data_ngram` field: fixed 3-grams over the whole token
/// (not prefix-only), lowercased and ASCII-folded.
///
/// Errs only if the ngram bounds passed to `NgramTokenizer::new` are invalid.
fn ngram_analyzer() -> tantivy::Result<TextAnalyzer> {
    Ok(TextAnalyzer::builder(NgramTokenizer::new(3, 3, false)?)
        .filter(RemoveLongFilter::limit(80))
        .filter(LowerCaser)
        .filter(AsciiFoldingFilter)
        .build())
}
/// Analyzer for the `data_exact` field: the entire input as a single raw
/// token (no splitting), lowercased and ASCII-folded.
fn exact_analyzer() -> TextAnalyzer {
    TextAnalyzer::builder(RawTokenizer::default())
        .filter(LowerCaser)
        .filter(AsciiFoldingFilter)
        .build()
}
/// Tokenizes text the same way `data_word` is indexed.
///
/// Use this to build query terms that line up exactly with indexed tokens.
pub fn tokenize_word(text: &str) -> Vec<String> {
    tokenize_with(word_analyzer(), text)
}
/// Tokenizes text the same way `data_ngram` is indexed.
///
/// Falls back to an empty token list if the ngram analyzer cannot be built.
pub fn tokenize_ngram(text: &str) -> Vec<String> {
    ngram_analyzer()
        .map(|analyzer| tokenize_with(analyzer, text))
        .unwrap_or_default()
}
/// Runs `text` through `analyzer` and collects the text of every token
/// the stream produces, in order.
fn tokenize_with(mut analyzer: TextAnalyzer, text: &str) -> Vec<String> {
    let mut stream = analyzer.token_stream(text);
    std::iter::from_fn(|| stream.next().map(|token| token.text.clone())).collect()
}
/// Builds a term scoped to a specific JSON path within a JSON field.
pub fn json_path_term(field: Field, column: &str, text: &str) -> Term {
let mut term = Term::from_field_json_path(field, column, false);
term.append_type_and_str(text);
term
}
/// Resolved handles for every field this crate expects in the search schema.
///
/// Construct via [`SchemaFields::from`], which fails loudly if any required
/// field is missing from the schema.
pub struct SchemaFields {
    pub pg_id: Field,
    pub table_name: Field,
    pub row_key: Field,
    pub data_word: Field,
    pub data_ngram: Field,
    pub data_exact: Field,
}
impl SchemaFields {
    /// Looks up all required fields in `schema`, returning a `SchemaError`
    /// naming the first field that is absent.
    pub fn from(schema: &Schema) -> tantivy::Result<Self> {
        Ok(Self {
            pg_id: get_field(schema, F_PG_ID)?,
            table_name: get_field(schema, F_TABLE_NAME)?,
            row_key: get_field(schema, F_ROW_KEY)?,
            data_word: get_field(schema, F_DATA_WORD)?,
            data_ngram: get_field(schema, F_DATA_NGRAM)?,
            data_exact: get_field(schema, F_DATA_EXACT)?,
        })
    }
}
/// Resolves `name` in `schema`, wrapping a miss in a descriptive
/// `SchemaError` so callers fail with the missing field's name.
fn get_field(schema: &Schema, name: &str) -> tantivy::Result<Field> {
    match schema.get_field(name) {
        Ok(field) => Ok(field),
        Err(e) => Err(tantivy::TantivyError::SchemaError(format!(
            "schema is missing field '{name}': {e}"
        ))),
    }
}