258 lines
9.4 KiB
Rust
258 lines
9.4 KiB
Rust
// search/src/lib.rs
|
|
|
|
use std::path::Path;
|
|
use tantivy::collector::TopDocs;
|
|
use tantivy::query::{
|
|
BooleanQuery, BoostQuery, FuzzyTermQuery, Occur, Query, QueryParser,
|
|
TermQuery,
|
|
};
|
|
use tantivy::schema::IndexRecordOption;
|
|
use tantivy::{Index, TantivyDocument, Term};
|
|
use tonic::{Request, Response, Status};
|
|
|
|
use common::proto::multieko2::search::{
|
|
search_response::Hit, SearchRequest, SearchResponse,
|
|
};
|
|
pub use common::proto::multieko2::search::searcher_server::SearcherServer;
|
|
use common::proto::multieko2::search::searcher_server::Searcher;
|
|
use tantivy::schema::Value;
|
|
|
|
/// Stateless gRPC search service handle. All per-request state (the tantivy
/// index, reader, and schema) is opened inside `search_table`, so a single
/// shared instance can serve all requests.
pub struct SearcherService;
|
|
|
|
// Normalize diacritics in queries.
//
// Folds accented Latin characters (Slovak plus common Czech/Hungarian
// neighbours) to their bare ASCII equivalents so that user input matches
// index terms produced by tokenizers using `AsciiFoldingFilter`.
// Characters without a mapping pass through unchanged, so case and
// punctuation are preserved.
fn normalize_slovak_text(text: &str) -> String {
    text.chars()
        .map(|c| match c {
            'á' | 'à' | 'â' | 'ä' | 'ă' | 'ā' => 'a',
            'Á' | 'À' | 'Â' | 'Ä' | 'Ă' | 'Ā' => 'A',
            'é' | 'è' | 'ê' | 'ë' | 'ě' | 'ē' => 'e',
            'É' | 'È' | 'Ê' | 'Ë' | 'Ě' | 'Ē' => 'E',
            'í' | 'ì' | 'î' | 'ï' | 'ī' => 'i',
            'Í' | 'Ì' | 'Î' | 'Ï' | 'Ī' => 'I',
            'ó' | 'ò' | 'ô' | 'ö' | 'ō' | 'ő' => 'o',
            'Ó' | 'Ò' | 'Ô' | 'Ö' | 'Ō' | 'Ő' => 'O',
            'ú' | 'ù' | 'û' | 'ü' | 'ū' | 'ű' => 'u',
            'Ú' | 'Ù' | 'Û' | 'Ü' | 'Ū' | 'Ű' => 'U',
            'ý' | 'ỳ' | 'ŷ' | 'ÿ' => 'y',
            'Ý' | 'Ỳ' | 'Ŷ' | 'Ÿ' => 'Y',
            'č' => 'c',
            'Č' => 'C',
            'ď' => 'd',
            'Ď' => 'D',
            // FIX: 'ĺ' and 'ŕ' (long l/r) are part of the Slovak alphabet
            // (e.g. "stĺp", "vŕba") but were previously not folded, so
            // queries containing them never matched ascii-folded terms.
            'ľ' | 'ĺ' => 'l',
            'Ľ' | 'Ĺ' => 'L',
            'ň' => 'n',
            'Ň' => 'N',
            'ř' | 'ŕ' => 'r',
            'Ř' | 'Ŕ' => 'R',
            'š' => 's',
            'Š' => 'S',
            'ť' => 't',
            'Ť' => 'T',
            'ž' => 'z',
            'Ž' => 'Z',
            _ => c,
        })
        .collect()
}
|
|
|
|
#[tonic::async_trait]
impl Searcher for SearcherService {
    /// Multi-layer full-text search over a per-table tantivy index.
    ///
    /// Builds a single boolean query out of four independently boosted
    /// layers (prefix > fuzzy > phrase-with-slop > trigram substring),
    /// runs it against the table's index, and returns up to 100 hits as
    /// `(pg_id, score)` pairs.
    ///
    /// # Errors
    /// * `InvalidArgument` — blank query.
    /// * `NotFound` — no index directory exists for `table_name`.
    /// * `Internal` — index open/read failures or a schema missing one of
    ///   the expected fields.
    async fn search_table(
        &self,
        request: Request<SearchRequest>,
    ) -> Result<Response<SearchResponse>, Status> {
        let req = request.into_inner();
        let table_name = req.table_name;
        let query_str = req.query;

        // Reject blank / whitespace-only queries before touching the disk.
        if query_str.trim().is_empty() {
            return Err(Status::invalid_argument("Query cannot be empty"));
        }

        // One index directory per table, relative to the server's working
        // directory. NOTE(review): `table_name` is used as a path component
        // unchecked — presumably validated upstream; confirm it cannot
        // contain separators like "../".
        let index_path = Path::new("./tantivy_indexes").join(&table_name);
        if !index_path.exists() {
            return Err(Status::not_found(format!(
                "No search index found for table '{}'",
                table_name
            )));
        }

        let index = Index::open_in_dir(&index_path)
            .map_err(|e| Status::internal(format!("Failed to open index: {}", e)))?;

        // Custom tokenizers are not persisted inside the index directory;
        // they must be re-registered on every open or query parsing fails.
        register_slovak_tokenizers(&index).map_err(|e| {
            Status::internal(format!("Failed to register Slovak tokenizers: {}", e))
        })?;

        let reader = index.reader().map_err(|e| {
            Status::internal(format!("Failed to create index reader: {}", e))
        })?;
        let searcher = reader.searcher();
        let schema = index.schema();

        // The indexer is expected to have created these four fields; a
        // missing one means the index was built against an older schema.
        let prefix_edge_field = schema.get_field("prefix_edge").map_err(|_| {
            Status::internal("Schema is missing the 'prefix_edge' field.")
        })?;
        let prefix_full_field = schema.get_field("prefix_full").map_err(|_| {
            Status::internal("Schema is missing the 'prefix_full' field.")
        })?;
        let text_ngram_field = schema.get_field("text_ngram").map_err(|_| {
            Status::internal("Schema is missing the 'text_ngram' field.")
        })?;
        let pg_id_field = schema.get_field("pg_id").map_err(|_| {
            Status::internal("Schema is missing the 'pg_id' field.")
        })?;

        // Fold diacritics so the query matches the ascii-folded index terms,
        // then split on whitespace into individual search words.
        let normalized_query = normalize_slovak_text(&query_str);
        let words: Vec<&str> = normalized_query.split_whitespace().collect();

        // Defensive: cannot normally happen, since blank queries were
        // rejected above and normalization never removes characters.
        if words.is_empty() {
            return Ok(Response::new(SearchResponse { hits: vec![] }));
        }

        // Each layer is added as an Occur::Should clause of the master
        // query, so any layer can contribute to a document's score.
        let mut query_layers: Vec<(Occur, Box<dyn Query>)> = Vec::new();

        // ===============================
        // LAYER 1: PREFIX MATCHING (HIGHEST PRIORITY, Boost: 4.0)
        // ===============================
        {
            // Every query word MUST match either the edge-ngram field
            // (prefix of an indexed word) or the full-word field.
            let mut must_clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new();
            for word in &words {
                let edge_term =
                    Term::from_field_text(prefix_edge_field, word);
                let full_term =
                    Term::from_field_text(prefix_full_field, word);

                // "edge OR full" for this one word.
                let per_word_query = BooleanQuery::new(vec![
                    (
                        Occur::Should,
                        Box::new(TermQuery::new(
                            edge_term,
                            IndexRecordOption::Basic,
                        )),
                    ),
                    (
                        Occur::Should,
                        Box::new(TermQuery::new(
                            full_term,
                            IndexRecordOption::Basic,
                        )),
                    ),
                ]);
                must_clauses.push((Occur::Must, Box::new(per_word_query) as Box<dyn Query>));
            }

            if !must_clauses.is_empty() {
                let prefix_query = BooleanQuery::new(must_clauses);
                let boosted_query =
                    BoostQuery::new(Box::new(prefix_query), 4.0);
                query_layers.push((Occur::Should, Box::new(boosted_query)));
            }
        }

        // ===============================
        // LAYER 2: FUZZY MATCHING (HIGH PRIORITY, Boost: 3.0)
        // ===============================
        {
            // Only the last word — presumably the one still being typed —
            // gets typo tolerance (edit distance 2). Safe unwrap: the empty
            // case returned early above.
            let last_word = words.last().unwrap();
            let fuzzy_term =
                Term::from_field_text(prefix_full_field, last_word);
            let fuzzy_query = FuzzyTermQuery::new(fuzzy_term, 2, true);
            let boosted_query = BoostQuery::new(Box::new(fuzzy_query), 3.0);
            query_layers.push((Occur::Should, Box::new(boosted_query)));
        }

        // ===============================
        // LAYER 3: PHRASE MATCHING WITH SLOP (MEDIUM PRIORITY, Boost: 2.0)
        // ===============================
        if words.len() > 1 {
            // Quoted phrase with slop 3: the words may appear up to three
            // positions apart. Parse failures are silently skipped — the
            // other layers still apply.
            let slop_parser =
                QueryParser::for_index(&index, vec![prefix_full_field]);
            let slop_query_str = format!("\"{}\"~3", normalized_query);
            if let Ok(slop_query) = slop_parser.parse_query(&slop_query_str) {
                let boosted_query = BoostQuery::new(slop_query, 2.0);
                query_layers.push((Occur::Should, Box::new(boosted_query)));
            }
        }

        // ===============================
        // LAYER 4: NGRAM SUBSTRING MATCHING (LOWEST PRIORITY, Boost: 1.0)
        // ===============================
        {
            // Trigram field catches matches in the middle of words; parse
            // failures are skipped like in layer 3.
            let ngram_parser =
                QueryParser::for_index(&index, vec![text_ngram_field]);
            if let Ok(ngram_query) =
                ngram_parser.parse_query(&normalized_query)
            {
                let boosted_query = BoostQuery::new(ngram_query, 1.0);
                query_layers.push((Occur::Should, Box::new(boosted_query)));
            }
        }

        let master_query = BooleanQuery::new(query_layers);

        let top_docs = searcher
            .search(&master_query, &TopDocs::with_limit(100))
            .map_err(|e| Status::internal(format!("Search failed: {}", e)))?;

        // Map each matching document back to its Postgres row id. Documents
        // without a usable pg_id are skipped rather than failing the whole
        // request.
        let mut hits = Vec::new();
        for (score, doc_address) in top_docs {
            let doc: TantivyDocument = searcher.doc(doc_address).map_err(|e| {
                Status::internal(format!("Failed to retrieve document: {}", e))
            })?;

            if let Some(pg_id_value) = doc.get_first(pg_id_field) {
                if let Some(pg_id) = pg_id_value.as_u64() {
                    hits.push(Hit {
                        // pg_id is stored as u64 but the proto uses i64;
                        // assumes ids fit in i64 — values above i64::MAX
                        // would wrap negative.
                        id: pg_id as i64,
                        score,
                    });
                }
            }
        }

        let response = SearchResponse { hits };
        Ok(Response::new(response))
    }
}
|
|
|
|
/// This function is now an exact mirror of the one in `server/src/search_schema.rs`
|
|
fn register_slovak_tokenizers(index: &Index) -> tantivy::Result<()> {
|
|
use tantivy::tokenizer::*;
|
|
|
|
let tokenizer_manager = index.tokenizers();
|
|
|
|
// TOKENIZER for `prefix_edge`: Edge N-gram (1-15 chars)
|
|
if tokenizer_manager.get("slovak_prefix_edge").is_none() {
|
|
let tokenizer = TextAnalyzer::builder(NgramTokenizer::new(1, 15, true)?)
|
|
.filter(RemoveLongFilter::limit(40))
|
|
.filter(LowerCaser)
|
|
.filter(AsciiFoldingFilter)
|
|
.build();
|
|
tokenizer_manager.register("slovak_prefix_edge", tokenizer);
|
|
}
|
|
|
|
// TOKENIZER for `prefix_full`: Simple word tokenizer
|
|
if tokenizer_manager.get("slovak_prefix_full").is_none() {
|
|
let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
|
|
.filter(RemoveLongFilter::limit(40))
|
|
.filter(LowerCaser)
|
|
.filter(AsciiFoldingFilter)
|
|
.build();
|
|
tokenizer_manager.register("slovak_prefix_full", tokenizer);
|
|
}
|
|
|
|
// NGRAM TOKENIZER: For substring matching.
|
|
if tokenizer_manager.get("slovak_ngram").is_none() {
|
|
let tokenizer = TextAnalyzer::builder(NgramTokenizer::new(3, 3, false)?)
|
|
.filter(RemoveLongFilter::limit(40))
|
|
.filter(LowerCaser)
|
|
.filter(AsciiFoldingFilter)
|
|
.build();
|
|
tokenizer_manager.register("slovak_ngram", tokenizer);
|
|
}
|
|
|
|
Ok(())
|
|
}
|