Slovak language tokenized search

filipriec
2025-06-09 16:36:18 +02:00
parent 50d15e321f
commit 4760f42589
7 changed files with 210 additions and 157 deletions


@@ -2,10 +2,11 @@
use std::path::Path;
use sqlx::{PgPool, Row};
-use tantivy::schema::{Schema, Term, TEXT, STORED, INDEXED};
+use tantivy::schema::{Schema, Term};
use tantivy::{doc, Index, IndexWriter};
use tokio::sync::mpsc::Receiver;
use tracing::{error, info, warn};
+use crate::search_schema;
const INDEX_DIR: &str = "./tantivy_indexes";
@@ -25,7 +26,6 @@ pub struct IndexCommandData {
}
/// The main loop for the background indexer task.
/// It listens for commands on the receiver and updates the Tantivy index.
pub async fn indexer_task(pool: PgPool, mut receiver: Receiver<IndexCommand>) {
info!("Background indexer task started.");
while let Some(command) = receiver.recv().await {
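
The IndexCommand type this loop matches on is defined above the shown hunks. A minimal sketch of its plausible shape, inferred from the handlers referenced below (the variant names are assumptions; the struct fields match what handle_add_or_update actually reads):

// Hypothetical sketch; the real definitions live elsewhere in this file.
pub enum IndexCommand {
    AddOrUpdate(IndexCommandData),
    Delete(IndexCommandData),
}

pub struct IndexCommandData {
    pub table_name: String,
    pub row_id: i64,
}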
@@ -62,21 +62,13 @@ async fn handle_add_or_update(
.await?;
let json_data: serde_json::Value = row.try_get("data")?;
-// 2. Prepare the Tantivy document
-let mut full_text = String::new();
-if let Some(obj) = json_data.as_object() {
-for value in obj.values() {
-if let Some(s) = value.as_str() {
-full_text.push_str(s);
-full_text.push(' ');
-}
-}
-}
+// 2. Extract all text content for Slovak processing
+let slovak_text = extract_text_content(&json_data);
// 3. Open the index and write the document
let (mut writer, schema) = get_index_writer(&data.table_name)?;
let pg_id_field = schema.get_field("pg_id").unwrap();
-let all_text_field = schema.get_field("all_text").unwrap();
+let text_sk_field = schema.get_field("text_sk").unwrap();
// First, delete any existing document with this ID to handle updates
let id_term = Term::from_field_u64(pg_id_field, data.row_id as u64);
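
The hunk ends right after the term is built; the deletion call itself sits outside the shown context. In Tantivy the update pattern is delete-then-re-add, so the elided line is presumably something like this (a sketch, not part of the diff):

// Mark any existing document with this pg_id for deletion; it takes
// effect at the next commit, so the add below replaces the old row.
writer.delete_term(id_term);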
@@ -85,13 +77,13 @@ async fn handle_add_or_update(
// Add the new document
writer.add_document(doc!(
pg_id_field => data.row_id as u64,
-all_text_field => full_text
+text_sk_field => slovak_text
))?;
// 4. Commit changes
writer.commit()?;
info!(
"Successfully indexed document id:{} for table:{}",
"Successfully indexed Slovak document id:{} for table:{}",
data.row_id, data.table_name
);
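
This commit only touches the write path. For completeness, a hedged sketch of what a query over the new text_sk field could look like, assuming search_schema::get_or_create_index exposes the same index (the table name and query term here are placeholders); QueryParser runs the query string through the analyzer registered for the field, so Slovak input is tokenized the same way at search time as at index time:

use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;

let index = search_schema::get_or_create_index("my_table")?;
let reader = index.reader()?;
let searcher = reader.searcher();
let text_sk_field = index.schema().get_field("text_sk").unwrap();
let parser = QueryParser::for_index(&index, vec![text_sk_field]);
let query = parser.parse_query("mesto")?;
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;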
@@ -122,19 +114,32 @@ async fn handle_delete(
fn get_index_writer(
table_name: &str,
) -> anyhow::Result<(IndexWriter, Schema)> {
-let index_path = Path::new(INDEX_DIR).join(table_name);
-std::fs::create_dir_all(&index_path)?;
-let index = Index::open_in_dir(&index_path).or_else(|_| {
-// If it doesn't exist, create it with the standard schema
-let mut schema_builder = Schema::builder();
-schema_builder.add_u64_field("pg_id", INDEXED | STORED);
-schema_builder.add_text_field("all_text", TEXT | STORED);
-let schema = schema_builder.build();
-Index::create_in_dir(&index_path, schema)
-})?;
+let index = search_schema::get_or_create_index(table_name)?;
let schema = index.schema();
let writer = index.writer(100_000_000)?; // 100MB heap
Ok((writer, schema))
}
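
get_index_writer now delegates schema and analyzer setup to search_schema::get_or_create_index, which this file's diff does not show. A plausible sketch of that helper, assuming it registers a custom analyzer under a hypothetical "slovak" tokenizer name; Tantivy ships no Slovak stemmer, so lower-casing plus ASCII folding of diacritics is a reasonable stand-in:

use std::path::Path;
use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, INDEXED, STORED};
use tantivy::tokenizer::{AsciiFoldingFilter, LowerCaser, SimpleTokenizer, TextAnalyzer};
use tantivy::Index;

const INDEX_DIR: &str = "./tantivy_indexes";

pub fn get_or_create_index(table_name: &str) -> anyhow::Result<Index> {
    let index_path = Path::new(INDEX_DIR).join(table_name);
    std::fs::create_dir_all(&index_path)?;
    let index = Index::open_in_dir(&index_path).or_else(|_| {
        let mut schema_builder = Schema::builder();
        schema_builder.add_u64_field("pg_id", INDEXED | STORED);
        // Bind text_sk to the custom analyzer registered below.
        let indexing = TextFieldIndexing::default()
            .set_tokenizer("slovak")
            .set_index_option(IndexRecordOption::WithFreqsAndPositions);
        let options = TextOptions::default()
            .set_indexing_options(indexing)
            .set_stored();
        schema_builder.add_text_field("text_sk", options);
        Index::create_in_dir(&index_path, schema_builder.build())
    })?;
    // Lower-case and fold diacritics (e.g. "Čaj" -> "caj") so accented and
    // unaccented spellings match. Registration must run on every open,
    // not only when the index is first created.
    index.tokenizers().register(
        "slovak",
        TextAnalyzer::builder(SimpleTokenizer::default())
            .filter(LowerCaser)
            .filter(AsciiFoldingFilter)
            .build(),
    );
    Ok(index)
}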
+/// Extract all text content from a JSON object for indexing
+fn extract_text_content(json_data: &serde_json::Value) -> String {
+let mut full_text = String::new();
+if let Some(obj) = json_data.as_object() {
+for value in obj.values() {
+match value {
+serde_json::Value::String(s) => {
+full_text.push_str(s);
+full_text.push(' ');
+}
+serde_json::Value::Number(n) => {
+full_text.push_str(&n.to_string());
+full_text.push(' ');
+}
+// We could recursively handle nested objects if needed
+_ => {}
+}
+}
+}
+full_text.trim().to_string()
+}
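
As the comment above notes, nested objects (and arrays) are currently skipped. If they ever need to contribute to the index, a recursive variant is straightforward (a sketch, not part of this commit):

fn extract_text_recursive(value: &serde_json::Value, out: &mut String) {
    match value {
        serde_json::Value::String(s) => {
            out.push_str(s);
            out.push(' ');
        }
        serde_json::Value::Number(n) => {
            out.push_str(&n.to_string());
            out.push(' ');
        }
        // Recurse into nested objects and arrays instead of dropping them.
        serde_json::Value::Object(obj) => {
            for v in obj.values() {
                extract_text_recursive(v, out);
            }
        }
        serde_json::Value::Array(arr) => {
            for v in arr {
                extract_text_recursive(v, out);
            }
        }
        _ => {}
    }
}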