Better search, but it still has some flaws. It at least works, even though it's not perfect. Needs more testing, but I'm pretty happy with it for now, so I'm keeping it this way.
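Roughly, the new code builds one BooleanQuery out of several boosted "layers", all joined with Occur::Should, so a prefix hit outranks a fuzzy hit, which outranks an ngram hit. Below is a minimal standalone sketch of that composition pattern; the field name `prefix_full`, the sample words, and the 4.0/3.0 boosts are illustrative values taken from the diff, not the actual service wiring.

```rust
// Sketch only: mirrors the layered-query idea from this commit, not the service code.
use tantivy::query::{BooleanQuery, BoostQuery, FuzzyTermQuery, Occur, Query, TermQuery};
use tantivy::schema::{Field, IndexRecordOption, Schema, TEXT};
use tantivy::Term;

fn build_layered_query(prefix_full: Field, words: &[&str]) -> BooleanQuery {
    let mut layers: Vec<(Occur, Box<dyn Query>)> = Vec::new();

    // Layer 1: every word must match as an exact term (boost 4.0).
    let must: Vec<(Occur, Box<dyn Query>)> = words
        .iter()
        .map(|w| {
            let term = Term::from_field_text(prefix_full, w);
            (
                Occur::Must,
                Box::new(TermQuery::new(term, IndexRecordOption::Basic)) as Box<dyn Query>,
            )
        })
        .collect();
    layers.push((
        Occur::Should,
        Box::new(BoostQuery::new(Box::new(BooleanQuery::new(must)), 4.0)),
    ));

    // Layer 2: the last word may fuzzy-match within edit distance 2 (boost 3.0).
    if let Some(last) = words.last() {
        let term = Term::from_field_text(prefix_full, last);
        let fuzzy = FuzzyTermQuery::new(term, 2, true);
        layers.push((Occur::Should, Box::new(BoostQuery::new(Box::new(fuzzy), 3.0))));
    }

    // Any single layer is enough to match; higher-boosted layers score higher.
    BooleanQuery::new(layers)
}

fn main() {
    // Hypothetical one-field schema, just to obtain a Field handle for the demo.
    let mut builder = Schema::builder();
    let prefix_full = builder.add_text_field("prefix_full", TEXT);
    let _schema = builder.build();

    let query = build_layered_query(prefix_full, &["slovenska", "posta"]);
    println!("{query:?}");
}
```

Since every layer is an `Occur::Should` clause, matching any one layer is enough to return a document, but the boosts make documents that also satisfy the stricter layers score higher.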
@@ -1,18 +1,20 @@
-// src/lib.rs
+// search/src/lib.rs
 
 use std::path::Path;
 use tantivy::collector::TopDocs;
-use tantivy::query::{BooleanQuery, Occur, Query, TermQuery};
+use tantivy::query::{
+    BooleanQuery, BoostQuery, FuzzyTermQuery, Occur, Query, QueryParser,
+    TermQuery,
+};
 use tantivy::schema::IndexRecordOption;
-use tantivy::tokenizer::Tokenizer;
 use tantivy::{Index, TantivyDocument, Term};
 use tonic::{Request, Response, Status};
 
 use common::proto::multieko2::search::{
     search_response::Hit, SearchRequest, SearchResponse,
 };
-use common::proto::multieko2::search::searcher_server::Searcher;
 pub use common::proto::multieko2::search::searcher_server::SearcherServer;
+use common::proto::multieko2::search::searcher_server::Searcher;
 use tantivy::schema::Value;
 
 pub struct SearcherService;
@@ -79,8 +81,8 @@ impl Searcher for SearcherService {
         let index = Index::open_in_dir(&index_path)
             .map_err(|e| Status::internal(format!("Failed to open index: {}", e)))?;
 
-        register_slovak_tokenizer(&index).map_err(|e| {
-            Status::internal(format!("Failed to register Slovak tokenizer: {}", e))
+        register_slovak_tokenizers(&index).map_err(|e| {
+            Status::internal(format!("Failed to register Slovak tokenizers: {}", e))
         })?;
 
         let reader = index.reader().map_err(|e| {
@@ -89,49 +91,109 @@ impl Searcher for SearcherService {
         let searcher = reader.searcher();
         let schema = index.schema();
 
-        let text_sk_field = schema.get_field("text_sk").map_err(|_| {
-            Status::internal("Schema is missing the 'text_sk' field.")
+        let prefix_edge_field = schema.get_field("prefix_edge").map_err(|_| {
+            Status::internal("Schema is missing the 'prefix_edge' field.")
         })?;
+        let prefix_full_field = schema.get_field("prefix_full").map_err(|_| {
+            Status::internal("Schema is missing the 'prefix_full' field.")
+        })?;
+        let text_ngram_field = schema.get_field("text_ngram").map_err(|_| {
+            Status::internal("Schema is missing the 'text_ngram' field.")
+        })?;
         let pg_id_field = schema.get_field("pg_id").map_err(|_| {
             Status::internal("Schema is missing the 'pg_id' field.")
         })?;
 
-        // --- FINAL, ROBUST QUERY LOGIC ---
-
-        // 1. Get the exact tokenizer used for indexing the target field.
-        let mut tokenizer = index
-            .tokenizer_for_field(text_sk_field)
-            .map_err(|e| Status::internal(format!("Tokenizer not found: {}", e)))?;
-
-        // 2. Manually tokenize the user's normalized query string.
-        // CORRECTED: Store the normalized string in a variable to extend its lifetime.
         let normalized_query = normalize_slovak_text(&query_str);
-        let mut token_stream = tokenizer.token_stream(&normalized_query);
+        let words: Vec<&str> = normalized_query.split_whitespace().collect();
 
-        let mut terms = Vec::new();
-        while let Some(token) = token_stream.next() {
-            terms.push(Term::from_field_text(text_sk_field, &token.text));
-        }
-
-        if terms.is_empty() {
+        if words.is_empty() {
             return Ok(Response::new(SearchResponse { hits: vec![] }));
         }
 
-        // 3. Create a TermQuery for each token.
-        let term_queries: Vec<(Occur, Box<dyn Query>)> = terms
-            .into_iter()
-            .map(|term| {
-                let term_query = TermQuery::new(term, IndexRecordOption::WithFreqs);
-                (Occur::Must, Box::new(term_query) as Box<dyn Query>)
-            })
-            .collect();
+        let mut query_layers: Vec<(Occur, Box<dyn Query>)> = Vec::new();
 
-        // 4. Combine them into a BooleanQuery.
-        let final_query = BooleanQuery::new(term_queries);
-        // --- END OF LOGIC ---
+        // ===============================
+        // LAYER 1: PREFIX MATCHING (HIGHEST PRIORITY, Boost: 4.0)
+        // ===============================
+        {
+            let mut must_clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new();
+            for word in &words {
+                let edge_term =
+                    Term::from_field_text(prefix_edge_field, word);
+                let full_term =
+                    Term::from_field_text(prefix_full_field, word);
+
+                let per_word_query = BooleanQuery::new(vec![
+                    (
+                        Occur::Should,
+                        Box::new(TermQuery::new(
+                            edge_term,
+                            IndexRecordOption::Basic,
+                        )),
+                    ),
+                    (
+                        Occur::Should,
+                        Box::new(TermQuery::new(
+                            full_term,
+                            IndexRecordOption::Basic,
+                        )),
+                    ),
+                ]);
+                must_clauses.push((Occur::Must, Box::new(per_word_query) as Box<dyn Query>));
+            }
+
+            if !must_clauses.is_empty() {
+                let prefix_query = BooleanQuery::new(must_clauses);
+                let boosted_query =
+                    BoostQuery::new(Box::new(prefix_query), 4.0);
+                query_layers.push((Occur::Should, Box::new(boosted_query)));
+            }
+        }
+
+        // ===============================
+        // LAYER 2: FUZZY MATCHING (HIGH PRIORITY, Boost: 3.0)
+        // ===============================
+        {
+            let last_word = words.last().unwrap();
+            let fuzzy_term =
+                Term::from_field_text(prefix_full_field, last_word);
+            let fuzzy_query = FuzzyTermQuery::new(fuzzy_term, 2, true);
+            let boosted_query = BoostQuery::new(Box::new(fuzzy_query), 3.0);
+            query_layers.push((Occur::Should, Box::new(boosted_query)));
+        }
+
+        // ===============================
+        // LAYER 3: PHRASE MATCHING WITH SLOP (MEDIUM PRIORITY, Boost: 2.0)
+        // ===============================
+        if words.len() > 1 {
+            let slop_parser =
+                QueryParser::for_index(&index, vec![prefix_full_field]);
+            let slop_query_str = format!("\"{}\"~3", normalized_query);
+            if let Ok(slop_query) = slop_parser.parse_query(&slop_query_str) {
+                let boosted_query = BoostQuery::new(slop_query, 2.0);
+                query_layers.push((Occur::Should, Box::new(boosted_query)));
+            }
+        }
+
+        // ===============================
+        // LAYER 4: NGRAM SUBSTRING MATCHING (LOWEST PRIORITY, Boost: 1.0)
+        // ===============================
+        {
+            let ngram_parser =
+                QueryParser::for_index(&index, vec![text_ngram_field]);
+            if let Ok(ngram_query) =
+                ngram_parser.parse_query(&normalized_query)
+            {
+                let boosted_query = BoostQuery::new(ngram_query, 1.0);
+                query_layers.push((Occur::Should, Box::new(boosted_query)));
+            }
+        }
+
+        let master_query = BooleanQuery::new(query_layers);
 
         let top_docs = searcher
-            .search(&final_query, &TopDocs::with_limit(100))
+            .search(&master_query, &TopDocs::with_limit(100))
             .map_err(|e| Status::internal(format!("Search failed: {}", e)))?;
 
         let mut hits = Vec::new();
@@ -142,7 +204,10 @@ impl Searcher for SearcherService {
 
             if let Some(pg_id_value) = doc.get_first(pg_id_field) {
                 if let Some(pg_id) = pg_id_value.as_u64() {
-                    hits.push(Hit { id: pg_id as i64, score });
+                    hits.push(Hit {
+                        id: pg_id as i64,
+                        score,
+                    });
                 }
             }
         }
@@ -152,22 +217,40 @@ impl Searcher for SearcherService {
     }
 }
 
-/// Registers the Slovak ngram tokenizer
-fn register_slovak_tokenizer(index: &Index) -> tantivy::Result<()> {
+/// This function is now an exact mirror of the one in `server/src/search_schema.rs`
+fn register_slovak_tokenizers(index: &Index) -> tantivy::Result<()> {
     use tantivy::tokenizer::*;
 
     let tokenizer_manager = index.tokenizers();
 
-    if tokenizer_manager.get("slovak").is_none() { // CHANGED BACK TO "slovak"
-        let slovak_ngram_tokenizer = TextAnalyzer::builder(
-            NgramTokenizer::new(3, 3, false)? // min=3, max=3, prefix_only=false
-        )
-        .filter(RemoveLongFilter::limit(40))
-        .filter(LowerCaser)
-        .filter(AsciiFoldingFilter)
-        .build();
+    // TOKENIZER for `prefix_edge`: Edge N-gram (1-15 chars)
+    if tokenizer_manager.get("slovak_prefix_edge").is_none() {
+        let tokenizer = TextAnalyzer::builder(NgramTokenizer::new(1, 15, true)?)
+            .filter(RemoveLongFilter::limit(40))
+            .filter(LowerCaser)
+            .filter(AsciiFoldingFilter)
+            .build();
+        tokenizer_manager.register("slovak_prefix_edge", tokenizer);
+    }
 
-        tokenizer_manager.register("slovak", slovak_ngram_tokenizer); // CHANGED BACK TO "slovak"
+    // TOKENIZER for `prefix_full`: Simple word tokenizer
+    if tokenizer_manager.get("slovak_prefix_full").is_none() {
+        let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
+            .filter(RemoveLongFilter::limit(40))
+            .filter(LowerCaser)
+            .filter(AsciiFoldingFilter)
+            .build();
+        tokenizer_manager.register("slovak_prefix_full", tokenizer);
     }
 
+    // NGRAM TOKENIZER: For substring matching.
+    if tokenizer_manager.get("slovak_ngram").is_none() {
+        let tokenizer = TextAnalyzer::builder(NgramTokenizer::new(3, 3, false)?)
+            .filter(RemoveLongFilter::limit(40))
+            .filter(LowerCaser)
+            .filter(AsciiFoldingFilter)
+            .build();
+        tokenizer_manager.register("slovak_ngram", tokenizer);
+    }
+
     Ok(())
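For context on the indexing side, here is a standalone sketch of what the three analyzers registered in `register_slovak_tokenizers` emit for a single word. It assumes the same tantivy tokenizer API the diff already uses; the sample input and labels are made up for the demo.

```rust
// Standalone sketch of the tokenizers registered in register_slovak_tokenizers().
use tantivy::tokenizer::{
    AsciiFoldingFilter, LowerCaser, NgramTokenizer, RemoveLongFilter, SimpleTokenizer,
    TextAnalyzer, TokenStream,
};

fn print_tokens(label: &str, analyzer: &mut TextAnalyzer, text: &str) {
    // Run the analyzer and print every token it produces.
    let mut stream = analyzer.token_stream(text);
    print!("{label}: ");
    while let Some(token) = stream.next() {
        print!("[{}] ", token.text);
    }
    println!();
}

fn main() -> tantivy::Result<()> {
    // Edge n-gram (1-15, prefix only): "Posta" -> p, po, pos, post, posta
    let mut prefix_edge = TextAnalyzer::builder(NgramTokenizer::new(1, 15, true)?)
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser)
        .filter(AsciiFoldingFilter)
        .build();

    // Plain word tokenizer, lowercased and ASCII-folded: "Posta" -> posta
    let mut prefix_full = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser)
        .filter(AsciiFoldingFilter)
        .build();

    // 3-gram substrings: "Posta" -> pos, ost, sta
    let mut ngram = TextAnalyzer::builder(NgramTokenizer::new(3, 3, false)?)
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser)
        .filter(AsciiFoldingFilter)
        .build();

    print_tokens("prefix_edge", &mut prefix_edge, "Posta");
    print_tokens("prefix_full", &mut prefix_full, "Posta");
    print_tokens("ngram", &mut ngram, "Posta");
    Ok(())
}
```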