Slovak language tokenized search
@@ -1,7 +1,11 @@
 // src/lib.rs
 
 use std::path::Path;
-use tantivy::{collector::TopDocs, query::QueryParser, Index, TantivyDocument};
+use tantivy::collector::TopDocs;
+use tantivy::query::{BooleanQuery, Occur, Query, TermQuery};
+use tantivy::schema::IndexRecordOption;
+use tantivy::tokenizer::Tokenizer;
+use tantivy::{Index, TantivyDocument, Term};
 use tonic::{Request, Response, Status};
 
 use common::proto::multieko2::search::{
@@ -13,6 +17,43 @@ use tantivy::schema::Value;
 
 pub struct SearcherService;
 
+// Normalize diacritics in queries
+fn normalize_slovak_text(text: &str) -> String {
+    text.chars()
+        .map(|c| match c {
+            'á' | 'à' | 'â' | 'ä' | 'ă' | 'ā' => 'a',
+            'Á' | 'À' | 'Â' | 'Ä' | 'Ă' | 'Ā' => 'A',
+            'é' | 'è' | 'ê' | 'ë' | 'ě' | 'ē' => 'e',
+            'É' | 'È' | 'Ê' | 'Ë' | 'Ě' | 'Ē' => 'E',
+            'í' | 'ì' | 'î' | 'ï' | 'ī' => 'i',
+            'Í' | 'Ì' | 'Î' | 'Ï' | 'Ī' => 'I',
+            'ó' | 'ò' | 'ô' | 'ö' | 'ō' | 'ő' => 'o',
+            'Ó' | 'Ò' | 'Ô' | 'Ö' | 'Ō' | 'Ő' => 'O',
+            'ú' | 'ù' | 'û' | 'ü' | 'ū' | 'ű' => 'u',
+            'Ú' | 'Ù' | 'Û' | 'Ü' | 'Ū' | 'Ű' => 'U',
+            'ý' | 'ỳ' | 'ŷ' | 'ÿ' => 'y',
+            'Ý' | 'Ỳ' | 'Ŷ' | 'Ÿ' => 'Y',
+            'č' => 'c',
+            'Č' => 'C',
+            'ď' => 'd',
+            'Ď' => 'D',
+            'ľ' => 'l',
+            'Ľ' => 'L',
+            'ň' => 'n',
+            'Ň' => 'N',
+            'ř' => 'r',
+            'Ř' => 'R',
+            'š' => 's',
+            'Š' => 'S',
+            'ť' => 't',
+            'Ť' => 'T',
+            'ž' => 'z',
+            'Ž' => 'Z',
+            _ => c,
+        })
+        .collect()
+}
+
 #[tonic::async_trait]
 impl Searcher for SearcherService {
     async fn search_table(
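A quick check of the fold added above (a hypothetical test, not part of this commit, assuming `normalize_slovak_text` exactly as written). Note that the match covers the accented vowels and č/ď/ľ/ň/ř/š/ť/ž, but the Slovak letters ĺ and ŕ have no arm and fall through `_ => c` unchanged:

```rust
#[cfg(test)]
mod normalize_tests {
    use super::normalize_slovak_text;

    #[test]
    fn folds_slovak_diacritics() {
        // Š -> S, ť -> t, ý -> y, ň -> n; plain ASCII passes through.
        assert_eq!(normalize_slovak_text("Šťastný deň"), "Stastny den");
        // ĺ has no match arm, so it is left as-is.
        assert_eq!(normalize_slovak_text("kĺb"), "kĺb");
    }
}
```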
@@ -27,9 +68,7 @@ impl Searcher for SearcherService {
             return Err(Status::invalid_argument("Query cannot be empty"));
         }
 
-        // Open the index for this table
         let index_path = Path::new("./tantivy_indexes").join(&table_name);
-
         if !index_path.exists() {
             return Err(Status::not_found(format!(
                 "No search index found for table '{}'",
@@ -37,68 +76,73 @@ impl Searcher for SearcherService {
             )));
         }
 
         // Open the index
-        let index = Index::open_in_dir(&index_path).map_err(|e| {
-            Status::internal(format!("Failed to open index: {}", e))
+        let index = Index::open_in_dir(&index_path)
+            .map_err(|e| Status::internal(format!("Failed to open index: {}", e)))?;
+
+        register_slovak_tokenizer(&index).map_err(|e| {
+            Status::internal(format!("Failed to register Slovak tokenizer: {}", e))
         })?;
 
         // Create reader and searcher
         let reader = index.reader().map_err(|e| {
             Status::internal(format!("Failed to create index reader: {}", e))
         })?;
 
         let searcher = reader.searcher();
         let schema = index.schema();
 
         // Get the fields we need
-        let all_text_field = match schema.get_field("all_text") {
-            Ok(field) => field,
-            Err(_) => {
-                return Err(Status::internal(
-                    "Schema is missing the 'all_text' field.",
-                ))
-            }
-        };
-
-        let pg_id_field = match schema.get_field("pg_id") {
-            Ok(field) => field,
-            Err(_) => {
-                return Err(Status::internal(
-                    "Schema is missing the 'pg_id' field.",
-                ))
-            }
-        };
-
-        // Parse the query
-        let query_parser =
-            QueryParser::for_index(&index, vec![all_text_field]);
-        let query = query_parser.parse_query(&query_str).map_err(|e| {
-            Status::invalid_argument(format!("Invalid query: {}", e))
+        let text_sk_field = schema.get_field("text_sk").map_err(|_| {
+            Status::internal("Schema is missing the 'text_sk' field.")
        })?;
+        let pg_id_field = schema.get_field("pg_id").map_err(|_| {
+            Status::internal("Schema is missing the 'pg_id' field.")
+        })?;
 
         // Perform the search
+        // --- FINAL, ROBUST QUERY LOGIC ---
+
+        // 1. Get the exact tokenizer used for indexing the target field.
+        let mut tokenizer = index
+            .tokenizer_for_field(text_sk_field)
+            .map_err(|e| Status::internal(format!("Tokenizer not found: {}", e)))?;
+
+        // 2. Manually tokenize the user's normalized query string.
+        // CORRECTED: Store the normalized string in a variable to extend its lifetime.
+        let normalized_query = normalize_slovak_text(&query_str);
+        let mut token_stream = tokenizer.token_stream(&normalized_query);
+
+        let mut terms = Vec::new();
+        while let Some(token) = token_stream.next() {
+            terms.push(Term::from_field_text(text_sk_field, &token.text));
+        }
+
+        if terms.is_empty() {
+            return Ok(Response::new(SearchResponse { hits: vec![] }));
+        }
+
+        // 3. Create a TermQuery for each token.
+        let term_queries: Vec<(Occur, Box<dyn Query>)> = terms
+            .into_iter()
+            .map(|term| {
+                let term_query = TermQuery::new(term, IndexRecordOption::WithFreqs);
+                (Occur::Must, Box::new(term_query) as Box<dyn Query>)
+            })
+            .collect();
+
+        // 4. Combine them into a BooleanQuery.
+        let final_query = BooleanQuery::new(term_queries);
+        // --- END OF LOGIC ---
 
         let top_docs = searcher
-            .search(&query, &TopDocs::with_limit(100))
+            .search(&final_query, &TopDocs::with_limit(100))
             .map_err(|e| Status::internal(format!("Search failed: {}", e)))?;
 
         // Convert results to our response format
         let mut hits = Vec::new();
         for (score, doc_address) in top_docs {
-            let doc: TantivyDocument = searcher.doc(doc_address).map_err(
-                |e| {
-                    Status::internal(format!(
-                        "Failed to retrieve document: {}",
-                        e
-                    ))
-                },
-            )?;
+            let doc: TantivyDocument = searcher.doc(doc_address).map_err(|e| {
+                Status::internal(format!("Failed to retrieve document: {}", e))
+            })?;
 
             if let Some(pg_id_value) = doc.get_first(pg_id_field) {
                 if let Some(pg_id) = pg_id_value.as_u64() {
-                    hits.push(Hit {
-                        id: pg_id as i64,
-                        score,
-                    });
+                    hits.push(Hit { id: pg_id as i64, score });
                 }
             }
         }
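The query side mirrors the index side: the "slovak" analyzer registered below emits 3-character ngrams, so requiring every ngram of the normalized query to match (Occur::Must) approximates a case- and diacritic-insensitive substring search. For example, the normalized query "stastny" tokenizes to sta, tas, ast, stn, tny, and only documents whose text_sk terms contain all five qualify. The same pattern, factored into a standalone helper (a sketch; `build_ngram_query` is hypothetical, the tantivy calls are the ones used in the diff above):

```rust
use tantivy::query::{BooleanQuery, Occur, Query, TermQuery};
use tantivy::schema::{Field, IndexRecordOption};
use tantivy::tokenizer::Tokenizer;
use tantivy::{Index, Term};

// Hypothetical helper mirroring the query construction in search_table.
fn build_ngram_query(
    index: &Index,
    field: Field,
    normalized: &str,
) -> tantivy::Result<BooleanQuery> {
    // Reuse the exact analyzer the field was indexed with, so the query
    // tokens line up with the stored 3-gram terms.
    let mut tokenizer = index.tokenizer_for_field(field)?;
    let mut stream = tokenizer.token_stream(normalized);

    let mut clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new();
    while let Some(token) = stream.next() {
        let term = Term::from_field_text(field, &token.text);
        // Occur::Must: a document has to contain every ngram of the query.
        clauses.push((
            Occur::Must,
            Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs)),
        ));
    }
    Ok(BooleanQuery::new(clauses))
}
```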
@@ -107,3 +151,24 @@ impl Searcher for SearcherService {
         Ok(Response::new(response))
     }
 }
+
+/// Registers the Slovak ngram tokenizer
+fn register_slovak_tokenizer(index: &Index) -> tantivy::Result<()> {
+    use tantivy::tokenizer::*;
+
+    let tokenizer_manager = index.tokenizers();
+
+    if tokenizer_manager.get("slovak").is_none() { // CHANGED BACK TO "slovak"
+        let slovak_ngram_tokenizer = TextAnalyzer::builder(
+            NgramTokenizer::new(3, 3, false)? // min=3, max=3, prefix_only=false
+        )
+        .filter(RemoveLongFilter::limit(40))
+        .filter(LowerCaser)
+        .filter(AsciiFoldingFilter)
+        .build();
+
+        tokenizer_manager.register("slovak", slovak_ngram_tokenizer); // CHANGED BACK TO "slovak"
+    }
+
+    Ok(())
+}
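For the query-side tokenization to line up with the stored terms, the text_sk field must have been declared with this same "slovak" analyzer when the index was built. The indexing side is not part of this commit; a minimal sketch of the schema the searcher appears to expect (field names taken from this diff, the exact options are assumptions):

```rust
use tantivy::schema::{
    IndexRecordOption, Schema, TextFieldIndexing, TextOptions, FAST, INDEXED, STORED,
};

// Sketch of the expected schema: "text_sk" analyzed with the registered
// "slovak" ngram tokenizer, plus a "pg_id" that search_table can read back.
fn build_schema() -> Schema {
    let mut builder = Schema::builder();
    let slovak_indexing = TextFieldIndexing::default()
        .set_tokenizer("slovak") // must match the name used in register_slovak_tokenizer
        .set_index_option(IndexRecordOption::WithFreqs);
    builder.add_text_field(
        "text_sk",
        TextOptions::default().set_indexing_options(slovak_indexing),
    );
    builder.add_u64_field("pg_id", INDEXED | STORED | FAST);
    builder.build()
}
```

Because tokenizer registrations live on the in-memory Index object and are not persisted with the index files, register_slovak_tokenizer has to run both on the writer side before indexing and, as search_table does above, after every Index::open_in_dir.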