Better search, but it still has some flaws. It at least works, even though it's not perfect. Needs more testing, but I'm pretty happy with it for now, so I'm keeping it this way.

This commit is contained in:
filipriec
2025-06-10 00:22:31 +02:00
parent 4760f42589
commit 350c522d19
3 changed files with 202 additions and 91 deletions

search/src/lib.rs

@@ -1,18 +1,20 @@
// src/lib.rs
// search/src/lib.rs
use std::path::Path;
use tantivy::collector::TopDocs;
use tantivy::query::{BooleanQuery, Occur, Query, TermQuery};
use tantivy::query::{
BooleanQuery, BoostQuery, FuzzyTermQuery, Occur, Query, QueryParser,
TermQuery,
};
use tantivy::schema::IndexRecordOption;
use tantivy::tokenizer::Tokenizer;
use tantivy::{Index, TantivyDocument, Term};
use tonic::{Request, Response, Status};
use common::proto::multieko2::search::{
search_response::Hit, SearchRequest, SearchResponse,
};
use common::proto::multieko2::search::searcher_server::Searcher;
pub use common::proto::multieko2::search::searcher_server::SearcherServer;
use common::proto::multieko2::search::searcher_server::Searcher;
use tantivy::schema::Value;
pub struct SearcherService;
@@ -79,8 +81,8 @@ impl Searcher for SearcherService {
let index = Index::open_in_dir(&index_path)
.map_err(|e| Status::internal(format!("Failed to open index: {}", e)))?;
register_slovak_tokenizer(&index).map_err(|e| {
Status::internal(format!("Failed to register Slovak tokenizer: {}", e))
register_slovak_tokenizers(&index).map_err(|e| {
Status::internal(format!("Failed to register Slovak tokenizers: {}", e))
})?;
let reader = index.reader().map_err(|e| {
@@ -89,49 +91,109 @@ impl Searcher for SearcherService {
let searcher = reader.searcher();
let schema = index.schema();
let text_sk_field = schema.get_field("text_sk").map_err(|_| {
Status::internal("Schema is missing the 'text_sk' field.")
let prefix_edge_field = schema.get_field("prefix_edge").map_err(|_| {
Status::internal("Schema is missing the 'prefix_edge' field.")
})?;
let prefix_full_field = schema.get_field("prefix_full").map_err(|_| {
Status::internal("Schema is missing the 'prefix_full' field.")
})?;
let text_ngram_field = schema.get_field("text_ngram").map_err(|_| {
Status::internal("Schema is missing the 'text_ngram' field.")
})?;
let pg_id_field = schema.get_field("pg_id").map_err(|_| {
Status::internal("Schema is missing the 'pg_id' field.")
})?;
// --- FINAL, ROBUST QUERY LOGIC ---
// 1. Get the exact tokenizer used for indexing the target field.
let mut tokenizer = index
.tokenizer_for_field(text_sk_field)
.map_err(|e| Status::internal(format!("Tokenizer not found: {}", e)))?;
// 2. Manually tokenize the user's normalized query string.
// CORRECTED: Store the normalized string in a variable to extend its lifetime.
let normalized_query = normalize_slovak_text(&query_str);
let mut token_stream = tokenizer.token_stream(&normalized_query);
let words: Vec<&str> = normalized_query.split_whitespace().collect();
let mut terms = Vec::new();
while let Some(token) = token_stream.next() {
terms.push(Term::from_field_text(text_sk_field, &token.text));
}
if terms.is_empty() {
if words.is_empty() {
return Ok(Response::new(SearchResponse { hits: vec![] }));
}
// 3. Create a TermQuery for each token.
let term_queries: Vec<(Occur, Box<dyn Query>)> = terms
.into_iter()
.map(|term| {
let term_query = TermQuery::new(term, IndexRecordOption::WithFreqs);
(Occur::Must, Box::new(term_query) as Box<dyn Query>)
})
.collect();
let mut query_layers: Vec<(Occur, Box<dyn Query>)> = Vec::new();
// 4. Combine them into a BooleanQuery.
let final_query = BooleanQuery::new(term_queries);
// --- END OF LOGIC ---
// ===============================
// LAYER 1: PREFIX MATCHING (HIGHEST PRIORITY, Boost: 4.0)
// ===============================
{
let mut must_clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new();
for word in &words {
let edge_term =
Term::from_field_text(prefix_edge_field, word);
let full_term =
Term::from_field_text(prefix_full_field, word);
let per_word_query = BooleanQuery::new(vec![
(
Occur::Should,
Box::new(TermQuery::new(
edge_term,
IndexRecordOption::Basic,
)),
),
(
Occur::Should,
Box::new(TermQuery::new(
full_term,
IndexRecordOption::Basic,
)),
),
]);
must_clauses.push((Occur::Must, Box::new(per_word_query) as Box<dyn Query>));
}
if !must_clauses.is_empty() {
let prefix_query = BooleanQuery::new(must_clauses);
let boosted_query =
BoostQuery::new(Box::new(prefix_query), 4.0);
query_layers.push((Occur::Should, Box::new(boosted_query)));
}
}
// ===============================
// LAYER 2: FUZZY MATCHING (HIGH PRIORITY, Boost: 3.0)
// ===============================
{
let last_word = words.last().unwrap();
let fuzzy_term =
Term::from_field_text(prefix_full_field, last_word);
let fuzzy_query = FuzzyTermQuery::new(fuzzy_term, 2, true);
let boosted_query = BoostQuery::new(Box::new(fuzzy_query), 3.0);
query_layers.push((Occur::Should, Box::new(boosted_query)));
}
// ===============================
// LAYER 3: PHRASE MATCHING WITH SLOP (MEDIUM PRIORITY, Boost: 2.0)
// ===============================
if words.len() > 1 {
let slop_parser =
QueryParser::for_index(&index, vec![prefix_full_field]);
let slop_query_str = format!("\"{}\"~3", normalized_query);
if let Ok(slop_query) = slop_parser.parse_query(&slop_query_str) {
let boosted_query = BoostQuery::new(slop_query, 2.0);
query_layers.push((Occur::Should, Box::new(boosted_query)));
}
}
// ===============================
// LAYER 4: NGRAM SUBSTRING MATCHING (LOWEST PRIORITY, Boost: 1.0)
// ===============================
{
let ngram_parser =
QueryParser::for_index(&index, vec![text_ngram_field]);
if let Ok(ngram_query) =
ngram_parser.parse_query(&normalized_query)
{
let boosted_query = BoostQuery::new(ngram_query, 1.0);
query_layers.push((Occur::Should, Box::new(boosted_query)));
}
}
let master_query = BooleanQuery::new(query_layers);
let top_docs = searcher
.search(&final_query, &TopDocs::with_limit(100))
.search(&master_query, &TopDocs::with_limit(100))
.map_err(|e| Status::internal(format!("Search failed: {}", e)))?;
let mut hits = Vec::new();
@@ -142,7 +204,10 @@ impl Searcher for SearcherService {
if let Some(pg_id_value) = doc.get_first(pg_id_field) {
if let Some(pg_id) = pg_id_value.as_u64() {
hits.push(Hit { id: pg_id as i64, score });
hits.push(Hit {
id: pg_id as i64,
score,
});
}
}
}
@@ -152,22 +217,40 @@ impl Searcher for SearcherService {
}
}
/// Registers the Slovak ngram tokenizer
fn register_slovak_tokenizer(index: &Index) -> tantivy::Result<()> {
/// This function is now an exact mirror of the one in `server/src/search_schema.rs`
fn register_slovak_tokenizers(index: &Index) -> tantivy::Result<()> {
use tantivy::tokenizer::*;
let tokenizer_manager = index.tokenizers();
if tokenizer_manager.get("slovak").is_none() { // CHANGED BACK TO "slovak"
let slovak_ngram_tokenizer = TextAnalyzer::builder(
NgramTokenizer::new(3, 3, false)? // min=3, max=3, prefix_only=false
)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(AsciiFoldingFilter)
.build();
// TOKENIZER for `prefix_edge`: Edge N-gram (1-15 chars)
if tokenizer_manager.get("slovak_prefix_edge").is_none() {
let tokenizer = TextAnalyzer::builder(NgramTokenizer::new(1, 15, true)?)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(AsciiFoldingFilter)
.build();
tokenizer_manager.register("slovak_prefix_edge", tokenizer);
}
tokenizer_manager.register("slovak", slovak_ngram_tokenizer); // CHANGED BACK TO "slovak"
// TOKENIZER for `prefix_full`: Simple word tokenizer
if tokenizer_manager.get("slovak_prefix_full").is_none() {
let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(AsciiFoldingFilter)
.build();
tokenizer_manager.register("slovak_prefix_full", tokenizer);
}
// NGRAM TOKENIZER: For substring matching.
if tokenizer_manager.get("slovak_ngram").is_none() {
let tokenizer = TextAnalyzer::builder(NgramTokenizer::new(3, 3, false)?)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(AsciiFoldingFilter)
.build();
tokenizer_manager.register("slovak_ngram", tokenizer);
}
Ok(())
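Note (not part of the commit): the hunk above calls normalize_slovak_text, which this diff does not show. Assuming it simply mirrors the LowerCaser + AsciiFoldingFilter steps used by the tokenizers, a minimal std-only sketch might look like the following; the actual helper in the codebase may differ.

// Hypothetical sketch only: lowercase the input and fold Slovak diacritics
// to their ASCII equivalents, matching what LowerCaser + AsciiFoldingFilter
// do at index time.
fn normalize_slovak_text(input: &str) -> String {
    input
        .to_lowercase()
        .chars()
        .map(|c| match c {
            'á' | 'ä' => 'a',
            'č' => 'c',
            'ď' => 'd',
            'é' => 'e',
            'í' => 'i',
            'ĺ' | 'ľ' => 'l',
            'ň' => 'n',
            'ó' | 'ô' => 'o',
            'ŕ' => 'r',
            'š' => 's',
            'ť' => 't',
            'ú' => 'u',
            'ý' => 'y',
            'ž' => 'z',
            _ => c,
        })
        .collect()
}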

server/src/indexer.rs

@@ -1,11 +1,11 @@
// src/indexer.rs
// server/src/indexer.rs
use std::path::Path;
use sqlx::{PgPool, Row};
use tantivy::schema::{Schema, Term};
use tantivy::{doc, Index, IndexWriter};
use tantivy::schema::Term;
use tantivy::{doc, IndexWriter};
use tokio::sync::mpsc::Receiver;
use tracing::{error, info, warn};
use tantivy::schema::Schema;
use crate::search_schema;
const INDEX_DIR: &str = "./tantivy_indexes";
@@ -49,44 +49,39 @@ async fn handle_add_or_update(
pool: &PgPool,
data: IndexCommandData,
) -> anyhow::Result<()> {
// 1. Fetch the full row data from PostgreSQL
let qualified_table = format!("gen.\"{}\"", data.table_name);
let query_str = format!(
"SELECT to_jsonb(t) AS data FROM {} t WHERE id = $1",
qualified_table
);
let row = sqlx::query(&query_str)
.bind(data.row_id)
.fetch_one(pool)
.await?;
let json_data: serde_json::Value = row.try_get("data")?;
// 2. Extract all text content for Slovak processing
let slovak_text = extract_text_content(&json_data);
// 3. Open the index and write the document
let (mut writer, schema) = get_index_writer(&data.table_name)?;
let pg_id_field = schema.get_field("pg_id").unwrap();
let text_sk_field = schema.get_field("text_sk").unwrap();
let prefix_edge_field = schema.get_field("prefix_edge").unwrap();
let prefix_full_field = schema.get_field("prefix_full").unwrap();
let text_ngram_field = schema.get_field("text_ngram").unwrap();
// First, delete any existing document with this ID to handle updates
let id_term = Term::from_field_u64(pg_id_field, data.row_id as u64);
writer.delete_term(id_term);
// Add the new document
writer.add_document(doc!(
pg_id_field => data.row_id as u64,
text_sk_field => slovak_text
prefix_edge_field => slovak_text.clone(),
prefix_full_field => slovak_text.clone(),
text_ngram_field => slovak_text
))?;
// 4. Commit changes
writer.commit()?;
info!(
"Successfully indexed Slovak document id:{} for table:{}",
"Successfully indexed document id:{} for table:{}",
data.row_id, data.table_name
);
Ok(())
}
@@ -123,7 +118,7 @@ fn get_index_writer(
/// Extract all text content from a JSON object for indexing
fn extract_text_content(json_data: &serde_json::Value) -> String {
let mut full_text = String::new();
if let Some(obj) = json_data.as_object() {
for value in obj.values() {
match value {
@@ -135,11 +130,10 @@ fn extract_text_content(json_data: &serde_json::Value) -> String {
full_text.push_str(&n.to_string());
full_text.push(' ');
}
// We could recursively handle nested objects if needed
_ => {}
}
}
}
full_text.trim().to_string()
}
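Note (not part of the commit): extract_text_content only walks top-level values, and the removed comment mentions recursing into nested objects. A hedged sketch of what such a recursive variant could look like, assuming the same serde_json::Value input:

// Sketch only: also walk nested objects and arrays, appending every string
// and number encountered to the accumulated text.
fn extract_text_content_recursive(json_data: &serde_json::Value, full_text: &mut String) {
    match json_data {
        serde_json::Value::String(s) => {
            full_text.push_str(s);
            full_text.push(' ');
        }
        serde_json::Value::Number(n) => {
            full_text.push_str(&n.to_string());
            full_text.push(' ');
        }
        serde_json::Value::Object(obj) => {
            for value in obj.values() {
                extract_text_content_recursive(value, full_text);
            }
        }
        serde_json::Value::Array(arr) => {
            for value in arr {
                extract_text_content_recursive(value, full_text);
            }
        }
        _ => {}
    }
}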

server/src/search_schema.rs

@@ -5,59 +5,93 @@ use tantivy::tokenizer::*;
use tantivy::Index;
use std::path::Path;
/// Creates a Tantivy schema optimized for Slovak ngram search
/// Creates a hybrid Slovak search schema with optimized prefix fields.
pub fn create_search_schema() -> Schema {
let mut schema_builder = Schema::builder();
// ID field to link back to PostgreSQL
schema_builder.add_u64_field("pg_id", INDEXED | STORED);
// Slovak text field with ngram tokenizer for search-as-you-type
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("slovak") // KEEP THE SAME NAME
// FIELD 1: For prefixes (1-15 chars).
let short_prefix_indexing = TextFieldIndexing::default()
.set_tokenizer("slovak_prefix_edge")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let text_options = TextOptions::default()
.set_indexing_options(text_field_indexing)
let short_prefix_options = TextOptions::default()
.set_indexing_options(short_prefix_indexing)
.set_stored();
schema_builder.add_text_field("prefix_edge", short_prefix_options);
schema_builder.add_text_field("text_sk", text_options);
// FIELD 2: For the full word.
let full_word_indexing = TextFieldIndexing::default()
.set_tokenizer("slovak_prefix_full")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let full_word_options = TextOptions::default()
.set_indexing_options(full_word_indexing)
.set_stored();
schema_builder.add_text_field("prefix_full", full_word_options);
// NGRAM FIELD: For substring matching.
let ngram_field_indexing = TextFieldIndexing::default()
.set_tokenizer("slovak_ngram")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let ngram_options = TextOptions::default()
.set_indexing_options(ngram_field_indexing)
.set_stored();
schema_builder.add_text_field("text_ngram", ngram_options);
schema_builder.build()
}
/// Registers the Slovak ngram tokenizer with the index
/// Registers all necessary Slovak tokenizers with the index.
pub fn register_slovak_tokenizer(index: &Index) -> tantivy::Result<()> {
let tokenizer_manager = index.tokenizers();
// Create Slovak ngram tokenizer pipeline - BUT REGISTER AS "slovak"
let slovak_ngram_tokenizer = TextAnalyzer::builder(
NgramTokenizer::new(3, 3, false)? // min=3, max=3, prefix_only=false
)
.filter(RemoveLongFilter::limit(40)) // Remove very long tokens
.filter(LowerCaser) // Convert to lowercase
.filter(AsciiFoldingFilter) // Handle diacritics: č->c, š->s, ž->z, etc.
.build();
// TOKENIZER for `prefix_edge`: Edge N-gram (1-15 chars)
if tokenizer_manager.get("slovak_prefix_edge").is_none() {
// YOUR RECOMMENDED FIX: Extend the max_gram to a more practical limit.
let tokenizer = TextAnalyzer::builder(
NgramTokenizer::new(1, 15, true)?
)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(AsciiFoldingFilter)
.build();
tokenizer_manager.register("slovak_prefix_edge", tokenizer);
}
tokenizer_manager.register("slovak", slovak_ngram_tokenizer); // SAME NAME
// TOKENIZER for `prefix_full`: Simple word tokenizer
if tokenizer_manager.get("slovak_prefix_full").is_none() {
let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(AsciiFoldingFilter)
.build();
tokenizer_manager.register("slovak_prefix_full", tokenizer);
}
// NGRAM TOKENIZER: For substring matching.
if tokenizer_manager.get("slovak_ngram").is_none() {
let tokenizer = TextAnalyzer::builder(
NgramTokenizer::new(3, 3, false)?
)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(AsciiFoldingFilter)
.build();
tokenizer_manager.register("slovak_ngram", tokenizer);
}
Ok(())
}
/// Gets or creates an index for a table with proper Slovak ngram processing
pub fn get_or_create_index(table_name: &str) -> tantivy::Result<Index> {
let index_path = Path::new("./tantivy_indexes").join(table_name);
std::fs::create_dir_all(&index_path)?;
let index = if index_path.join("meta.json").exists() {
Index::open_in_dir(&index_path)?
} else {
let schema = create_search_schema();
Index::create_in_dir(&index_path, schema)?
};
// Always register the tokenizer when opening
register_slovak_tokenizer(&index)?;
Ok(index)
}
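Note (not part of the commit): a usage sketch under this commit's assumptions, indexing one row into the three hybrid fields via get_or_create_index. The table name and sample text are illustrative placeholders, not values from the codebase.

use tantivy::doc;

// Sketch: open (or create) the per-table index, then write the same
// normalized text into prefix_edge, prefix_full and text_ngram so that each
// field's tokenizer builds its own view (edge n-grams, whole words, 3-grams).
fn index_example_row() -> tantivy::Result<()> {
    let index = get_or_create_index("adresar")?; // placeholder table name
    let schema = index.schema();
    let pg_id = schema.get_field("pg_id")?;
    let prefix_edge = schema.get_field("prefix_edge")?;
    let prefix_full = schema.get_field("prefix_full")?;
    let text_ngram = schema.get_field("text_ngram")?;

    let text = "Bratislava Hlavna stanica".to_string(); // placeholder content
    let mut writer = index.writer(50_000_000)?;
    writer.add_document(doc!(
        pg_id => 1u64,
        prefix_edge => text.clone(),
        prefix_full => text.clone(),
        text_ngram => text
    ))?;
    writer.commit()?;
    Ok(())
}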