search with multiquery redesigned

This commit is contained in:
Priec
2026-04-29 19:56:17 +02:00
parent fb4769301c
commit b928004c76
7 changed files with 726 additions and 537 deletions

View File

@@ -1,10 +1,26 @@
// common/src/search.rs
use std::path::{Path, PathBuf};
use tantivy::schema::*;
use tantivy::tokenizer::*;
use tantivy::schema::{
Field, IndexRecordOption, JsonObjectOptions, Schema, TextFieldIndexing, Term, INDEXED,
STORED, STRING,
};
use tantivy::tokenizer::{
AsciiFoldingFilter, LowerCaser, NgramTokenizer, RawTokenizer, RemoveLongFilter,
SimpleTokenizer, TextAnalyzer, TokenStream,
};
use tantivy::Index;
pub const F_PG_ID: &str = "pg_id";
pub const F_TABLE_NAME: &str = "table_name";
pub const F_ROW_KEY: &str = "row_key";
pub const F_DATA_WORD: &str = "data_word";
pub const F_DATA_NGRAM: &str = "data_ngram";
pub const F_DATA_EXACT: &str = "data_exact";
pub const TOK_WORD: &str = "kw_word";
pub const TOK_NGRAM: &str = "kw_ngram";
pub const TOK_EXACT: &str = "kw_exact";
/// Returns the on-disk path for a profile search index.
pub fn search_index_path(root: &Path, profile_name: &str) -> PathBuf {
root.join(profile_name)
@@ -15,117 +31,152 @@ pub fn search_row_key(table_name: &str, row_id: i64) -> String {
format!("{}:{}", table_name, row_id)
}
/// Normalizes user-entered search text while preserving letter case.
pub fn normalize_search_text(text: &str) -> String {
text.chars()
.map(|ch| match ch {
'á' | 'à' | 'â' | 'ä' | 'ă' | 'ā' => 'a',
'Á' | 'À' | 'Â' | 'Ä' | 'Ă' | 'Ā' => 'A',
'é' | 'è' | 'ê' | 'ë' | 'ě' | 'ē' => 'e',
'É' | 'È' | 'Ê' | 'Ë' | 'Ě' | 'Ē' => 'E',
'í' | 'ì' | 'î' | 'ï' | 'ī' => 'i',
'Í' | 'Ì' | 'Î' | 'Ï' | 'Ī' => 'I',
'ó' | 'ò' | 'ô' | 'ö' | 'ō' | 'ő' => 'o',
'Ó' | 'Ò' | 'Ô' | 'Ö' | 'Ō' | 'Ő' => 'O',
'ú' | 'ù' | 'û' | 'ü' | 'ū' | 'ű' => 'u',
'Ú' | 'Ù' | 'Û' | 'Ü' | 'Ū' | 'Ű' => 'U',
'ý' | 'ỳ' | 'ŷ' | 'ÿ' => 'y',
'Ý' | 'Ỳ' | 'Ŷ' | 'Ÿ' => 'Y',
'č' => 'c',
'Č' => 'C',
'ď' => 'd',
'Ď' => 'D',
'ľ' => 'l',
'Ľ' => 'L',
'ň' => 'n',
'Ň' => 'N',
'ř' => 'r',
'Ř' => 'R',
'š' => 's',
'Š' => 'S',
'ť' => 't',
'Ť' => 'T',
'ž' => 'z',
'Ž' => 'Z',
_ => ch,
})
.collect()
/// Normalizes user-entered values for exact-mode terms.
pub fn normalize_exact(input: &str) -> String {
let trimmed = input.trim();
if trimmed.is_empty() {
return String::new();
}
let mut analyzer = exact_analyzer();
let mut stream = analyzer.token_stream(trimmed);
let mut out = String::with_capacity(trimmed.len());
while let Some(token) = stream.next() {
out.push_str(&token.text);
}
out
}
/// Normalizes an exact-match value so indexed data and user input use the same form.
pub fn normalize_exact_value(text: &str) -> String {
normalize_search_text(text).to_lowercase()
/// Normalizes a column name to the JSON-key form used at index time.
pub fn normalize_column_name(column: &str) -> String {
column.to_ascii_lowercase()
}
/// Creates a hybrid Slovak search schema with optimized prefix fields.
/// Creates the column-aware search schema.
pub fn create_search_schema() -> Schema {
let mut schema_builder = Schema::builder();
schema_builder.add_u64_field("pg_id", INDEXED | STORED);
schema_builder.add_text_field("table_name", STRING | STORED);
schema_builder.add_text_field("row_key", STRING | STORED);
schema_builder.add_text_field("column_exact", STRING);
schema_builder.add_u64_field(F_PG_ID, INDEXED | STORED);
schema_builder.add_text_field(F_TABLE_NAME, STRING | STORED);
schema_builder.add_text_field(F_ROW_KEY, STRING | STORED);
// For prefixes (1-4 chars).
let short_prefix_indexing = TextFieldIndexing::default()
.set_tokenizer("slovak_prefix_edge")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let short_prefix_options = TextOptions::default()
.set_indexing_options(short_prefix_indexing)
.set_stored();
schema_builder.add_text_field("prefix_edge", short_prefix_options);
// For the full word.
let full_word_indexing = TextFieldIndexing::default()
.set_tokenizer("slovak_prefix_full")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let full_word_options = TextOptions::default()
.set_indexing_options(full_word_indexing)
.set_stored();
schema_builder.add_text_field("prefix_full", full_word_options);
// NGRAM FIELD: For substring matching.
let ngram_field_indexing = TextFieldIndexing::default()
.set_tokenizer("slovak_ngram")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let ngram_options = TextOptions::default()
.set_indexing_options(ngram_field_indexing)
.set_stored();
schema_builder.add_text_field("text_ngram", ngram_options);
schema_builder.add_json_field(F_DATA_WORD, json_options(TOK_WORD, true, false));
schema_builder.add_json_field(F_DATA_NGRAM, json_options(TOK_NGRAM, true, false));
schema_builder.add_json_field(F_DATA_EXACT, json_options(TOK_EXACT, false, false));
schema_builder.build()
}
/// Registers all necessary Slovak tokenizers with the index.
///
/// This must be called by ANY process that opens the index
/// to ensure the tokenizers are loaded into memory.
pub fn register_slovak_tokenizers(index: &Index) -> tantivy::Result<()> {
fn json_options(
tokenizer_name: &str,
with_positions: bool,
stored: bool,
) -> JsonObjectOptions {
let index_option = if with_positions {
IndexRecordOption::WithFreqsAndPositions
} else {
IndexRecordOption::Basic
};
let indexing = TextFieldIndexing::default()
.set_tokenizer(tokenizer_name)
.set_index_option(index_option);
let mut options = JsonObjectOptions::default().set_indexing_options(indexing);
if stored {
options = options.set_stored();
}
options
}
/// Registers all required tokenizers with the index.
pub fn register_tokenizers(index: &Index) -> tantivy::Result<()> {
let tokenizer_manager = index.tokenizers();
// TOKENIZER for `prefix_edge`: Edge N-gram (1-4 chars)
let edge_tokenizer = TextAnalyzer::builder(NgramTokenizer::new(1, 4, true)?)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(AsciiFoldingFilter)
.build();
tokenizer_manager.register("slovak_prefix_edge", edge_tokenizer);
// TOKENIZER for `prefix_full`: Simple word tokenizer
let full_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(AsciiFoldingFilter)
.build();
tokenizer_manager.register("slovak_prefix_full", full_tokenizer);
// NGRAM TOKENIZER: For substring matching.
let ngram_tokenizer = TextAnalyzer::builder(NgramTokenizer::new(3, 3, false)?)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(AsciiFoldingFilter)
.build();
tokenizer_manager.register("slovak_ngram", ngram_tokenizer);
tokenizer_manager.register(TOK_WORD, word_analyzer());
tokenizer_manager.register(TOK_NGRAM, ngram_analyzer()?);
tokenizer_manager.register(TOK_EXACT, exact_analyzer());
Ok(())
}
fn word_analyzer() -> TextAnalyzer {
TextAnalyzer::builder(SimpleTokenizer::default())
.filter(RemoveLongFilter::limit(80))
.filter(LowerCaser)
.filter(AsciiFoldingFilter)
.build()
}
fn ngram_analyzer() -> tantivy::Result<TextAnalyzer> {
Ok(TextAnalyzer::builder(NgramTokenizer::new(3, 3, false)?)
.filter(RemoveLongFilter::limit(80))
.filter(LowerCaser)
.filter(AsciiFoldingFilter)
.build())
}
fn exact_analyzer() -> TextAnalyzer {
TextAnalyzer::builder(RawTokenizer::default())
.filter(LowerCaser)
.filter(AsciiFoldingFilter)
.build()
}
/// Tokenizes text the same way `data_word` is indexed.
pub fn tokenize_word(text: &str) -> Vec<String> {
tokenize_with(word_analyzer(), text)
}
/// Tokenizes text the same way `data_ngram` is indexed.
pub fn tokenize_ngram(text: &str) -> Vec<String> {
match ngram_analyzer() {
Ok(analyzer) => tokenize_with(analyzer, text),
Err(_) => Vec::new(),
}
}
fn tokenize_with(mut analyzer: TextAnalyzer, text: &str) -> Vec<String> {
let mut stream = analyzer.token_stream(text);
let mut out = Vec::new();
while let Some(token) = stream.next() {
out.push(token.text.clone());
}
out
}
/// Builds a term scoped to a specific JSON path within a JSON field.
pub fn json_path_term(field: Field, column: &str, text: &str) -> Term {
let mut term = Term::from_field_json_path(field, column, false);
term.append_type_and_str(text);
term
}
/// Returns all required schema fields or fails loudly on mismatch.
pub struct SchemaFields {
pub pg_id: Field,
pub table_name: Field,
pub row_key: Field,
pub data_word: Field,
pub data_ngram: Field,
pub data_exact: Field,
}
impl SchemaFields {
pub fn from(schema: &Schema) -> tantivy::Result<Self> {
Ok(Self {
pg_id: get_field(schema, F_PG_ID)?,
table_name: get_field(schema, F_TABLE_NAME)?,
row_key: get_field(schema, F_ROW_KEY)?,
data_word: get_field(schema, F_DATA_WORD)?,
data_ngram: get_field(schema, F_DATA_NGRAM)?,
data_exact: get_field(schema, F_DATA_EXACT)?,
})
}
}
fn get_field(schema: &Schema, name: &str) -> tantivy::Result<Field> {
schema.get_field(name).map_err(|e| {
tantivy::TantivyError::SchemaError(format!("schema is missing field '{name}': {e}"))
})
}