// komp_ac/search/src/lib.rs

// src/lib.rs

use std::collections::HashMap;
use std::path::Path;
use tantivy::collector::TopDocs;
use tantivy::query::{
    BooleanQuery, BoostQuery, FuzzyTermQuery, Occur, Query, QueryParser,
    TermQuery,
};
use tantivy::schema::{IndexRecordOption, Value};
use tantivy::{Index, TantivyDocument, Term};
use tonic::{Request, Response, Status};

use common::proto::komp_ac::search::{
    search_response::Hit, SearchRequest, SearchResponse,
};
pub use common::proto::komp_ac::search::searcher_server::SearcherServer;
use common::proto::komp_ac::search::searcher_server::Searcher;
use common::search::register_slovak_tokenizers;
use sqlx::{PgPool, Row};
use tracing::info;

/// gRPC search service; the `Searcher` trait is implemented for this type.
///
/// It owns the Postgres connection pool that the service uses to load the
/// row data returned alongside search hits.
pub struct SearcherService {
    /// Connection pool used for every database query issued by the service.
    pub pool: PgPool,
}

/// Replaces accented Latin letters with their unaccented ASCII base letter.
///
/// Covers the full Slovak alphabet (including the long consonants `ĺ` and
/// `ŕ`) plus the Czech, Hungarian and Western-European variants that commonly
/// show up in the same data. Case is preserved and every other character is
/// passed through unchanged.
///
/// Only precomposed characters are handled; a base letter followed by a
/// separate combining mark is left as-is.
fn normalize_slovak_text(text: &str) -> String {
    text.chars()
        .map(|c| match c {
            // Vowels
            'á' | 'à' | 'â' | 'ä' | 'ă' | 'ā' => 'a',
            'Á' | 'À' | 'Â' | 'Ä' | 'Ă' | 'Ā' => 'A',
            'é' | 'è' | 'ê' | 'ë' | 'ě' | 'ē' => 'e',
            'É' | 'È' | 'Ê' | 'Ë' | 'Ě' | 'Ē' => 'E',
            'í' | 'ì' | 'î' | 'ï' | 'ī' => 'i',
            'Í' | 'Ì' | 'Î' | 'Ï' | 'Ī' => 'I',
            'ó' | 'ò' | 'ô' | 'ö' | 'ō' | 'ő' => 'o',
            'Ó' | 'Ò' | 'Ô' | 'Ö' | 'Ō' | 'Ő' => 'O',
            'ú' | 'ù' | 'û' | 'ü' | 'ū' | 'ű' | 'ů' => 'u',
            'Ú' | 'Ù' | 'Û' | 'Ü' | 'Ū' | 'Ű' | 'Ů' => 'U',
            'ý' | 'ỳ' | 'ŷ' | 'ÿ' => 'y',
            'Ý' | 'Ỳ' | 'Ŷ' | 'Ÿ' => 'Y',
            // Consonants
            'č' => 'c',
            'Č' => 'C',
            'ď' => 'd',
            'Ď' => 'D',
            // Slovak has both the soft `ľ` and the long `ĺ`.
            'ľ' | 'ĺ' => 'l',
            'Ľ' | 'Ĺ' => 'L',
            'ň' => 'n',
            'Ň' => 'N',
            // `ŕ` is Slovak, `ř` is Czech.
            'ŕ' | 'ř' => 'r',
            'Ŕ' | 'Ř' => 'R',
            'š' => 's',
            'Š' => 'S',
            'ť' => 't',
            'Ť' => 'T',
            'ž' => 'z',
            'Ž' => 'Z',
            _ => c,
        })
        .collect()
}

#[tonic::async_trait]
impl Searcher for SearcherService {
    async fn search_table(
        &self,
        request: Request<SearchRequest>,
    ) -> Result<Response<SearchResponse>, Status> {
        let req = request.into_inner();
        let table_name = req.table_name;
        let query_str = req.query;

        // --- MODIFIED LOGIC ---
        // If the query is empty, fetch the 5 most recent records.
        if query_str.trim().is_empty() {
            info!(
                "Empty query for table '{}'. Fetching default results.",
                table_name
            );
            let qualified_table = format!("gen.\"{}\"", table_name);
            let sql = format!(
                "SELECT id, to_jsonb(t) AS data FROM {} t ORDER BY id DESC LIMIT 5",
                qualified_table
            );

            let rows = sqlx::query(&sql)
                .fetch_all(&self.pool)
                .await
                .map_err(|e| {
                    Status::internal(format!(
                        "DB query for default results failed: {}",
                        e
                    ))
                })?;

            let hits: Vec<Hit> = rows
                .into_iter()
                .map(|row| {
                    let id: i64 = row.try_get("id").unwrap_or_default();
                    let json_data: serde_json::Value =
                        row.try_get("data").unwrap_or_default();
                    Hit {
                        id,
                        // Score is 0.0 as this is not a relevance-ranked search
                        score: 0.0,
                        content_json: json_data.to_string(),
                    }
                })
                .collect();

            info!("--- SERVER: Successfully processed empty query. Returning {} default hits. ---", hits.len());
            return Ok(Response::new(SearchResponse { hits }));
        }
        // --- END OF MODIFIED LOGIC ---

        let index_path = Path::new("./tantivy_indexes").join(&table_name);
        if !index_path.exists() {
            return Err(Status::not_found(format!(
                "No search index found for table '{}'",
                table_name
            )));
        }

        let index = Index::open_in_dir(&index_path)
            .map_err(|e| Status::internal(format!("Failed to open index: {}", e)))?;

        register_slovak_tokenizers(&index).map_err(|e| {
            Status::internal(format!("Failed to register Slovak tokenizers: {}", e))
        })?;

        let reader = index.reader().map_err(|e| {
            Status::internal(format!("Failed to create index reader: {}", e))
        })?;
        let searcher = reader.searcher();
        let schema = index.schema();

        let pg_id_field = schema.get_field("pg_id").map_err(|_| {
            Status::internal("Schema is missing the 'pg_id' field.")
        })?;

        // --- Query Building Logic (no changes here) ---
        let prefix_edge_field = schema.get_field("prefix_edge").unwrap();
        let prefix_full_field = schema.get_field("prefix_full").unwrap();
        let text_ngram_field = schema.get_field("text_ngram").unwrap();
        let normalized_query = normalize_slovak_text(&query_str);
        let words: Vec<&str> = normalized_query.split_whitespace().collect();
        if words.is_empty() {
            return Ok(Response::new(SearchResponse { hits: vec![] }));
        }
        let mut query_layers: Vec<(Occur, Box<dyn Query>)> = Vec::new();
        // ... all your query building layers remain exactly the same ...
        // ===============================
        // LAYER 1: PREFIX MATCHING (HIGHEST PRIORITY, Boost: 4.0)
        // ===============================
        {
            let mut must_clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new();
            for word in &words {
                let edge_term =
                    Term::from_field_text(prefix_edge_field, word);
                let full_term =
                    Term::from_field_text(prefix_full_field, word);

                let per_word_query = BooleanQuery::new(vec![
                    (
                        Occur::Should,
                        Box::new(TermQuery::new(
                            edge_term,
                            IndexRecordOption::Basic,
                        )),
                    ),
                    (
                        Occur::Should,
                        Box::new(TermQuery::new(
                            full_term,
                            IndexRecordOption::Basic,
                        )),
                    ),
                ]);
                must_clauses.push((Occur::Must, Box::new(per_word_query) as Box<dyn Query>));
            }

            if !must_clauses.is_empty() {
                let prefix_query = BooleanQuery::new(must_clauses);
                let boosted_query =
                    BoostQuery::new(Box::new(prefix_query), 4.0);
                query_layers.push((Occur::Should, Box::new(boosted_query)));
            }
        }

        // ===============================
        // LAYER 2: FUZZY MATCHING (HIGH PRIORITY, Boost: 3.0)
        // ===============================
        {
            let last_word = words.last().unwrap();
            let fuzzy_term =
                Term::from_field_text(prefix_full_field, last_word);
            let fuzzy_query = FuzzyTermQuery::new(fuzzy_term, 2, true);
            let boosted_query = BoostQuery::new(Box::new(fuzzy_query), 3.0);
            query_layers.push((Occur::Should, Box::new(boosted_query)));
        }

        // ===============================
        // LAYER 3: PHRASE MATCHING WITH SLOP (MEDIUM PRIORITY, Boost: 2.0)
        // ===============================
        if words.len() > 1 {
            let slop_parser =
                QueryParser::for_index(&index, vec![prefix_full_field]);
            let slop_query_str = format!("\"{}\"~3", normalized_query);
            if let Ok(slop_query) = slop_parser.parse_query(&slop_query_str) {
                let boosted_query = BoostQuery::new(slop_query, 2.0);
                query_layers.push((Occur::Should, Box::new(boosted_query)));
            }
        }

        // ===============================
        // LAYER 4: NGRAM SUBSTRING MATCHING (LOWEST PRIORITY, Boost: 1.0)
        // ===============================
        {
            let ngram_parser =
                QueryParser::for_index(&index, vec![text_ngram_field]);
            if let Ok(ngram_query) =
                ngram_parser.parse_query(&normalized_query)
            {
                let boosted_query = BoostQuery::new(ngram_query, 1.0);
                query_layers.push((Occur::Should, Box::new(boosted_query)));
            }
        }
        let master_query = BooleanQuery::new(query_layers);
        // --- End of Query Building Logic ---

        let top_docs = searcher
            .search(&master_query, &TopDocs::with_limit(100))
            .map_err(|e| Status::internal(format!("Search failed: {}", e)))?;

        if top_docs.is_empty() {
            return Ok(Response::new(SearchResponse { hits: vec![] }));
        }

        // --- NEW LOGIC: Fetch from DB and combine results ---

        // Step 1: Extract (score, pg_id) from Tantivy results.
        let mut scored_ids: Vec<(f32, u64)> = Vec::new();
        for (score, doc_address) in top_docs {
            let doc: TantivyDocument = searcher.doc(doc_address).map_err(|e| {
                Status::internal(format!("Failed to retrieve document: {}", e))
            })?;
            if let Some(pg_id_value) = doc.get_first(pg_id_field) {
                if let Some(pg_id) = pg_id_value.as_u64() {
                    scored_ids.push((score, pg_id));
                }
            }
        }

        // Step 2: Fetch all corresponding rows from Postgres in a single query.
        let pg_ids: Vec<i64> =
            scored_ids.iter().map(|(_, id)| *id as i64).collect();
        let qualified_table = format!("gen.\"{}\"", table_name);
        let query_str = format!(
            "SELECT id, to_jsonb(t) AS data FROM {} t WHERE id = ANY($1)",
            qualified_table
        );

        let rows = sqlx::query(&query_str)
            .bind(&pg_ids)
            .fetch_all(&self.pool)
            .await
            .map_err(|e| {
                Status::internal(format!("Database query failed: {}", e))
            })?;

        // Step 3: Map the database results by ID for quick lookup.
        let mut content_map: HashMap<i64, String> = HashMap::new();
        for row in rows {
            let id: i64 = row.try_get("id").unwrap_or(0);
            let json_data: serde_json::Value =
                row.try_get("data").unwrap_or(serde_json::Value::Null);
            content_map.insert(id, json_data.to_string());
        }

        // Step 4: Build the final response, combining Tantivy scores with PG content.
        let hits: Vec<Hit> = scored_ids
            .into_iter()
            .filter_map(|(score, pg_id)| {
                content_map
                    .get(&(pg_id as i64))
                    .map(|content_json| Hit {
                        id: pg_id as i64,
                        score,
                        content_json: content_json.clone(),
                    })
            })
            .collect();

        info!("--- SERVER: Successfully processed search. Returning {} hits. ---", hits.len());

        let response = SearchResponse { hits };
        Ok(Response::new(response))
    }
}