Slovak language tokenized search

filipriec
2025-06-09 16:36:18 +02:00
parent 50d15e321f
commit 4760f42589
7 changed files with 210 additions and 157 deletions

Cargo.lock (generated)

@@ -2843,6 +2843,7 @@ dependencies = [
"prost",
"regex",
"rstest",
"rust-stemmers",
"search",
"serde",
"serde_json",

src/lib.rs (search service)

@@ -1,7 +1,11 @@
// src/lib.rs
use std::path::Path;
use tantivy::{collector::TopDocs, query::QueryParser, Index, TantivyDocument};
use tantivy::collector::TopDocs;
use tantivy::query::{BooleanQuery, Occur, Query, TermQuery};
use tantivy::schema::IndexRecordOption;
use tantivy::tokenizer::Tokenizer;
use tantivy::{Index, TantivyDocument, Term};
use tonic::{Request, Response, Status};
use common::proto::multieko2::search::{
@@ -13,6 +17,43 @@ use tantivy::schema::Value;
pub struct SearcherService;
/// Normalizes Slovak diacritics in query text before tokenization.
fn normalize_slovak_text(text: &str) -> String {
text.chars()
.map(|c| match c {
'á' | 'à' | 'â' | 'ä' | 'ă' | 'ā' => 'a',
'Á' | 'À' | 'Â' | 'Ä' | 'Ă' | 'Ā' => 'A',
'é' | 'è' | 'ê' | 'ë' | 'ě' | 'ē' => 'e',
'É' | 'È' | 'Ê' | 'Ë' | 'Ě' | 'Ē' => 'E',
'í' | 'ì' | 'î' | 'ï' | 'ī' => 'i',
'Í' | 'Ì' | 'Î' | 'Ï' | 'Ī' => 'I',
'ó' | 'ò' | 'ô' | 'ö' | 'ō' | 'ő' => 'o',
'Ó' | 'Ò' | 'Ô' | 'Ö' | 'Ō' | 'Ő' => 'O',
'ú' | 'ù' | 'û' | 'ü' | 'ū' | 'ű' => 'u',
'Ú' | 'Ù' | 'Û' | 'Ü' | 'Ū' | 'Ű' => 'U',
'ý' | 'ỳ' | 'ŷ' | 'ÿ' => 'y',
'Ý' | 'Ỳ' | 'Ŷ' | 'Ÿ' => 'Y',
'č' => 'c',
'Č' => 'C',
'ď' => 'd',
'Ď' => 'D',
'ľ' | 'ĺ' => 'l',
'Ľ' | 'Ĺ' => 'L',
'ň' => 'n',
'Ň' => 'N',
'ř' | 'ŕ' => 'r',
'Ř' | 'Ŕ' => 'R',
'š' => 's',
'Š' => 'S',
'ť' => 't',
'Ť' => 'T',
'ž' => 'z',
'Ž' => 'Z',
_ => c,
})
.collect()
}
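// Illustration only (not part of this commit): a minimal test sketching what
// the normalization yields; the inputs are hypothetical sample strings.
#[cfg(test)]
mod normalize_tests {
    use super::normalize_slovak_text;

    #[test]
    fn folds_slovak_diacritics() {
        assert_eq!(normalize_slovak_text("žltý kôň"), "zlty kon");
        // Characters without a match arm pass through unchanged.
        assert_eq!(normalize_slovak_text("abc-123"), "abc-123");
    }
}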
#[tonic::async_trait]
impl Searcher for SearcherService {
async fn search_table(
@@ -27,9 +68,7 @@ impl Searcher for SearcherService {
return Err(Status::invalid_argument("Query cannot be empty"));
}
// Open the index for this table
let index_path = Path::new("./tantivy_indexes").join(&table_name);
if !index_path.exists() {
return Err(Status::not_found(format!(
"No search index found for table '{}'",
@@ -37,68 +76,73 @@ impl Searcher for SearcherService {
)));
}
// Open the index
let index = Index::open_in_dir(&index_path).map_err(|e| {
Status::internal(format!("Failed to open index: {}", e))
let index = Index::open_in_dir(&index_path)
.map_err(|e| Status::internal(format!("Failed to open index: {}", e)))?;
register_slovak_tokenizer(&index).map_err(|e| {
Status::internal(format!("Failed to register Slovak tokenizer: {}", e))
})?;
// Create reader and searcher
let reader = index.reader().map_err(|e| {
Status::internal(format!("Failed to create index reader: {}", e))
})?;
let searcher = reader.searcher();
let schema = index.schema();
// Get the fields we need
let all_text_field = match schema.get_field("all_text") {
Ok(field) => field,
Err(_) => {
return Err(Status::internal(
"Schema is missing the 'all_text' field.",
))
}
};
let pg_id_field = match schema.get_field("pg_id") {
Ok(field) => field,
Err(_) => {
return Err(Status::internal(
"Schema is missing the 'pg_id' field.",
))
}
};
// Parse the query
let query_parser =
QueryParser::for_index(&index, vec![all_text_field]);
let query = query_parser.parse_query(&query_str).map_err(|e| {
Status::invalid_argument(format!("Invalid query: {}", e))
let text_sk_field = schema.get_field("text_sk").map_err(|_| {
Status::internal("Schema is missing the 'text_sk' field.")
})?;
let pg_id_field = schema.get_field("pg_id").map_err(|_| {
Status::internal("Schema is missing the 'pg_id' field.")
})?;
// Perform the search
// Build the query with the same analysis pipeline that indexed the field.
// 1. Get the exact tokenizer used for indexing the target field.
let mut tokenizer = index
.tokenizer_for_field(text_sk_field)
.map_err(|e| Status::internal(format!("Tokenizer not found: {}", e)))?;
// 2. Manually tokenize the user's normalized query string.
// Bind the normalized string so it outlives the token stream that borrows it.
let normalized_query = normalize_slovak_text(&query_str);
let mut token_stream = tokenizer.token_stream(&normalized_query);
let mut terms = Vec::new();
while let Some(token) = token_stream.next() {
terms.push(Term::from_field_text(text_sk_field, &token.text));
}
if terms.is_empty() {
return Ok(Response::new(SearchResponse { hits: vec![] }));
}
// 3. Create a TermQuery for each token.
let term_queries: Vec<(Occur, Box<dyn Query>)> = terms
.into_iter()
.map(|term| {
let term_query = TermQuery::new(term, IndexRecordOption::WithFreqs);
(Occur::Must, Box::new(term_query) as Box<dyn Query>)
})
.collect();
// 4. Combine them into a BooleanQuery.
let final_query = BooleanQuery::new(term_queries);
let top_docs = searcher
.search(&query, &TopDocs::with_limit(100))
.search(&final_query, &TopDocs::with_limit(100))
.map_err(|e| Status::internal(format!("Search failed: {}", e)))?;
// Convert results to our response format
let mut hits = Vec::new();
for (score, doc_address) in top_docs {
let doc: TantivyDocument = searcher.doc(doc_address).map_err(
|e| {
Status::internal(format!(
"Failed to retrieve document: {}",
e
))
},
)?;
let doc: TantivyDocument = searcher.doc(doc_address).map_err(|e| {
Status::internal(format!("Failed to retrieve document: {}", e))
})?;
if let Some(pg_id_value) = doc.get_first(pg_id_field) {
if let Some(pg_id) = pg_id_value.as_u64() {
hits.push(Hit {
id: pg_id as i64,
score,
});
hits.push(Hit { id: pg_id as i64, score });
}
}
}
@@ -107,3 +151,24 @@ impl Searcher for SearcherService {
Ok(Response::new(response))
}
}
/// Registers the Slovak ngram tokenizer
fn register_slovak_tokenizer(index: &Index) -> tantivy::Result<()> {
use tantivy::tokenizer::*;
let tokenizer_manager = index.tokenizers();
if tokenizer_manager.get("slovak").is_none() { // CHANGED BACK TO "slovak"
let slovak_ngram_tokenizer = TextAnalyzer::builder(
NgramTokenizer::new(3, 3, false)? // min=3, max=3, prefix_only=false
)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
.filter(AsciiFoldingFilter)
.build();
tokenizer_manager.register("slovak", slovak_ngram_tokenizer); // CHANGED BACK TO "slovak"
}
Ok(())
}
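// Sketch (illustration only, not part of this commit): inspecting what the
// registered "slovak" analyzer emits. With min=max=3 ngrams, a word expands
// into overlapping trigrams that are lowercased and ASCII-folded.
#[allow(dead_code)]
fn debug_slovak_tokens(index: &Index, text: &str) {
    use tantivy::tokenizer::TokenStream;
    if let Some(mut analyzer) = index.tokenizers().get("slovak") {
        let mut stream = analyzer.token_stream(text);
        // For "Bratislava" this prints: bra rat ati tis isl sla lav ava
        while let Some(token) = stream.next() {
            print!("{} ", token.text);
        }
    }
}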

server/Cargo.toml

@@ -31,6 +31,7 @@ bcrypt = "0.17.0"
validator = { version = "0.20.0", features = ["derive"] }
uuid = { version = "1.16.0", features = ["serde", "v4"] }
jsonwebtoken = "9.3.1"
rust-stemmers = "1.2.0"
[lib]
name = "server"

server/src/bin/manual_indexer.rs (deleted)

@@ -1,83 +0,0 @@
// In server/src/bin/manual_indexer.rs
use sqlx::{PgPool, Row};
use tantivy::schema::*;
use tantivy::{doc, Index};
use std::path::Path;
// --- CONFIGURATION ---
// IMPORTANT: Change this to a table name that actually exists and has data in your test DB.
// From your grpcurl output, "2025_test_post" is a good candidate.
const TABLE_TO_INDEX: &str = "2025_test_post2";
const INDEX_DIR: &str = "./tantivy_indexes";
#[tokio::main]
async fn main() -> anyhow::Result<()> {
// --- Database Connection ---
// This assumes you have a .env file with DATABASE_URL
dotenvy::dotenv().ok();
let database_url = std::env::var("DATABASE_URL")
.expect("DATABASE_URL must be set in your .env file");
let pool = PgPool::connect(&database_url).await?;
println!("Connected to database.");
// --- Tantivy Schema Definition ---
let mut schema_builder = Schema::builder();
// This field will store the original Postgres row ID. It's crucial.
schema_builder.add_u64_field("pg_id", INDEXED | STORED);
// This field will contain ALL text data from the row, concatenated.
schema_builder.add_text_field("all_text", TEXT | STORED);
let schema = schema_builder.build();
// --- Index Creation ---
let index_path = Path::new(INDEX_DIR).join(TABLE_TO_INDEX);
if index_path.exists() {
println!("Removing existing index at: {}", index_path.display());
std::fs::remove_dir_all(&index_path)?;
}
std::fs::create_dir_all(&index_path)?;
let index = Index::create_in_dir(&index_path, schema.clone())?;
let mut index_writer = index.writer(100_000_000)?; // 100MB heap
println!("Indexing table: {}", TABLE_TO_INDEX);
// --- Data Fetching and Indexing ---
let qualified_table = format!("gen.\"{}\"", TABLE_TO_INDEX);
let query_str = format!("SELECT id, to_jsonb(t) AS data FROM {} t", qualified_table);
let rows = sqlx::query(&query_str).fetch_all(&pool).await?;
if rows.is_empty() {
println!("Warning: No rows found in table '{}'. Index will be empty.", TABLE_TO_INDEX);
}
let pg_id_field = schema.get_field("pg_id").unwrap();
let all_text_field = schema.get_field("all_text").unwrap();
for row in &rows {
let id: i64 = row.try_get("id")?;
let data: serde_json::Value = row.try_get("data")?;
// Concatenate all text values from the JSON into one big string.
let mut full_text = String::new();
if let Some(obj) = data.as_object() {
for value in obj.values() {
if let Some(s) = value.as_str() {
full_text.push_str(s);
full_text.push(' ');
}
}
}
// Add the document to Tantivy
index_writer.add_document(doc!(
pg_id_field => id as u64,
all_text_field => full_text
))?;
}
// --- Finalize ---
index_writer.commit()?;
println!("Successfully indexed {} documents into '{}'", rows.len(), index_path.display());
Ok(())
}

server/src/indexer.rs

@@ -2,10 +2,11 @@
use std::path::Path;
use sqlx::{PgPool, Row};
use tantivy::schema::{Schema, Term, TEXT, STORED, INDEXED};
use tantivy::schema::{Schema, Term};
use tantivy::{doc, Index, IndexWriter};
use tokio::sync::mpsc::Receiver;
use tracing::{error, info, warn};
use crate::search_schema;
const INDEX_DIR: &str = "./tantivy_indexes";
@@ -25,7 +26,6 @@ pub struct IndexCommandData {
}
/// The main loop for the background indexer task.
/// It listens for commands on the receiver and updates the Tantivy index.
pub async fn indexer_task(pool: PgPool, mut receiver: Receiver<IndexCommand>) {
info!("Background indexer task started.");
while let Some(command) = receiver.recv().await {
@@ -62,21 +62,13 @@ async fn handle_add_or_update(
.await?;
let json_data: serde_json::Value = row.try_get("data")?;
// 2. Prepare the Tantivy document
let mut full_text = String::new();
if let Some(obj) = json_data.as_object() {
for value in obj.values() {
if let Some(s) = value.as_str() {
full_text.push_str(s);
full_text.push(' ');
}
}
}
// 2. Extract all text content for Slovak processing
let slovak_text = extract_text_content(&json_data);
// 3. Open the index and write the document
let (mut writer, schema) = get_index_writer(&data.table_name)?;
let pg_id_field = schema.get_field("pg_id").unwrap();
let all_text_field = schema.get_field("all_text").unwrap();
let text_sk_field = schema.get_field("text_sk").unwrap();
// First, delete any existing document with this ID to handle updates
let id_term = Term::from_field_u64(pg_id_field, data.row_id as u64);
@@ -85,13 +77,13 @@ async fn handle_add_or_update(
// Add the new document
writer.add_document(doc!(
pg_id_field => data.row_id as u64,
all_text_field => full_text
text_sk_field => slovak_text
))?;
// 4. Commit changes
writer.commit()?;
info!(
"Successfully indexed document id:{} for table:{}",
"Successfully indexed Slovak document id:{} for table:{}",
data.row_id, data.table_name
);
@@ -122,19 +114,32 @@ async fn handle_delete(
fn get_index_writer(
table_name: &str,
) -> anyhow::Result<(IndexWriter, Schema)> {
let index_path = Path::new(INDEX_DIR).join(table_name);
std::fs::create_dir_all(&index_path)?;
let index = Index::open_in_dir(&index_path).or_else(|_| {
// If it doesn't exist, create it with the standard schema
let mut schema_builder = Schema::builder();
schema_builder.add_u64_field("pg_id", INDEXED | STORED);
schema_builder.add_text_field("all_text", TEXT | STORED);
let schema = schema_builder.build();
Index::create_in_dir(&index_path, schema)
})?;
let index = search_schema::get_or_create_index(table_name)?;
let schema = index.schema();
let writer = index.writer(100_000_000)?; // 100MB heap
Ok((writer, schema))
}
/// Extract all text content from a JSON object for indexing
fn extract_text_content(json_data: &serde_json::Value) -> String {
let mut full_text = String::new();
if let Some(obj) = json_data.as_object() {
for value in obj.values() {
match value {
serde_json::Value::String(s) => {
full_text.push_str(s);
full_text.push(' ');
}
serde_json::Value::Number(n) => {
full_text.push_str(&n.to_string());
full_text.push(' ');
}
// We could recursively handle nested objects if needed
_ => {}
}
}
}
full_text.trim().to_string()
}
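// Illustration only (not part of this commit): a small test exercising the
// helper with a hypothetical to_jsonb row; values other than strings and
// numbers are skipped, and ordering follows serde_json's map iteration.
#[cfg(test)]
mod extract_text_tests {
    use super::extract_text_content;

    #[test]
    fn concatenates_strings_and_numbers() {
        let row = serde_json::json!({"meno": "Ján", "mesto": "Žilina", "vek": 42});
        assert_eq!(extract_text_content(&row), "Ján Žilina 42");
        // Nulls, arrays, and nested objects are ignored entirely.
        let row = serde_json::json!({"a": null, "b": [1, 2]});
        assert_eq!(extract_text_content(&row), "");
    }
}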

server/src/lib.rs

@@ -2,6 +2,7 @@
pub mod db;
pub mod auth;
pub mod indexer;
pub mod search_schema;
pub mod server;
pub mod adresar;
pub mod uctovnictvo;

server/src/search_schema.rs (new)

@@ -0,0 +1,63 @@
// server/src/search_schema.rs
use tantivy::schema::*;
use tantivy::tokenizer::*;
use tantivy::Index;
use std::path::Path;
/// Creates a Tantivy schema optimized for Slovak ngram search
pub fn create_search_schema() -> Schema {
let mut schema_builder = Schema::builder();
// ID field to link back to PostgreSQL
schema_builder.add_u64_field("pg_id", INDEXED | STORED);
// Slovak text field with ngram tokenizer for search-as-you-type
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("slovak") // KEEP THE SAME NAME
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let text_options = TextOptions::default()
.set_indexing_options(text_field_indexing)
.set_stored();
schema_builder.add_text_field("text_sk", text_options);
schema_builder.build()
}
/// Registers the Slovak ngram tokenizer with the index
pub fn register_slovak_tokenizer(index: &Index) -> tantivy::Result<()> {
let tokenizer_manager = index.tokenizers();
// Create the Slovak ngram tokenizer pipeline and register it as "slovak"
let slovak_ngram_tokenizer = TextAnalyzer::builder(
NgramTokenizer::new(3, 3, false)? // min=3, max=3, prefix_only=false
)
.filter(RemoveLongFilter::limit(40)) // Remove very long tokens
.filter(LowerCaser) // Convert to lowercase
.filter(AsciiFoldingFilter) // Handle diacritics: č->c, š->s, ž->z, etc.
.build();
tokenizer_manager.register("slovak", slovak_ngram_tokenizer); // SAME NAME
Ok(())
}
/// Gets or creates an index for a table with proper Slovak ngram processing
pub fn get_or_create_index(table_name: &str) -> tantivy::Result<Index> {
let index_path = Path::new("./tantivy_indexes").join(table_name);
std::fs::create_dir_all(&index_path)?;
let index = if index_path.join("meta.json").exists() {
Index::open_in_dir(&index_path)?
} else {
let schema = create_search_schema();
Index::create_in_dir(&index_path, schema)?
};
// Always register the tokenizer when opening
register_slovak_tokenizer(&index)?;
Ok(index)
}
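// Usage sketch (illustration only, not part of this commit): indexing a single
// document through the helper, which guarantees the "slovak" tokenizer is
// registered before any write. The table name and text are hypothetical.
#[allow(dead_code)]
fn example_index_row() -> tantivy::Result<()> {
    let index = get_or_create_index("example_table")?;
    let schema = index.schema();
    let pg_id = schema.get_field("pg_id")?;
    let text_sk = schema.get_field("text_sk")?;

    let mut writer = index.writer(50_000_000)?; // 50MB heap, same API as the indexer
    writer.add_document(tantivy::doc!(
        pg_id => 1u64,
        text_sk => "zelené mesto pri rieke"
    ))?;
    writer.commit()?;
    Ok(())
}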