From afd9228efa207dad635a7ca5e6ed0228a94c20f0 Mon Sep 17 00:00:00 2001
From: filipriec
Date: Wed, 11 Jun 2025 14:07:22 +0200
Subject: [PATCH] json in the output of the tantivy

---
 Cargo.lock                           | 30 +++++-----
 common/proto/search.proto            |  4 +-
 common/src/proto/descriptor.bin      | Bin 22240 -> 22335 bytes
 common/src/proto/multieko2.search.rs |  6 +-
 search/Cargo.toml                    |  1 +
 search/src/lib.rs                    | 81 +++++++++++++++++++++------
 server/src/server/run.rs             |  7 ++-
 7 files changed, 91 insertions(+), 38 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 9ffe175..8e39ec1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2740,6 +2740,7 @@ dependencies = [
  "prost",
  "serde",
  "serde_json",
+ "sqlx",
  "tantivy",
  "tokio",
  "tonic",
@@ -3019,9 +3020,9 @@ dependencies = [
 
 [[package]]
 name = "sqlx"
-version = "0.8.5"
+version = "0.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f3c3a85280daca669cfd3bcb68a337882a8bc57ec882f72c5d13a430613a738e"
+checksum = "1fefb893899429669dcdd979aff487bd78f4064e5e7907e4269081e0ef7d97dc"
 dependencies = [
  "sqlx-core",
  "sqlx-macros",
@@ -3032,9 +3033,9 @@ dependencies = [
 
 [[package]]
 name = "sqlx-core"
-version = "0.8.5"
+version = "0.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f743f2a3cea30a58cd479013f75550e879009e3a02f616f18ca699335aa248c3"
+checksum = "ee6798b1838b6a0f69c007c133b8df5866302197e404e8b6ee8ed3e3a5e68dc6"
 dependencies = [
  "base64",
  "bytes",
@@ -3070,9 +3071,9 @@ dependencies = [
 
 [[package]]
 name = "sqlx-macros"
-version = "0.8.5"
+version = "0.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7f4200e0fde19834956d4252347c12a083bdcb237d7a1a1446bffd8768417dce"
+checksum = "a2d452988ccaacfbf5e0bdbc348fb91d7c8af5bee192173ac3636b5fb6e6715d"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -3083,9 +3084,9 @@ dependencies = [
 
 [[package]]
 name = "sqlx-macros-core"
-version = "0.8.5"
+version = "0.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "882ceaa29cade31beca7129b6beeb05737f44f82dbe2a9806ecea5a7093d00b7"
+checksum = "19a9c1841124ac5a61741f96e1d9e2ec77424bf323962dd894bdb93f37d5219b"
 dependencies = [
  "dotenvy",
  "either",
@@ -3102,16 +3103,15 @@ dependencies = [
  "sqlx-postgres",
  "sqlx-sqlite",
  "syn 2.0.100",
- "tempfile",
  "tokio",
  "url",
 ]
 
 [[package]]
 name = "sqlx-mysql"
-version = "0.8.5"
+version = "0.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0afdd3aa7a629683c2d750c2df343025545087081ab5942593a5288855b1b7a7"
+checksum = "aa003f0038df784eb8fecbbac13affe3da23b45194bd57dba231c8f48199c526"
 dependencies = [
  "atoi",
  "base64",
@@ -3154,9 +3154,9 @@ dependencies = [
 
 [[package]]
 name = "sqlx-postgres"
-version = "0.8.5"
+version = "0.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a0bedbe1bbb5e2615ef347a5e9d8cd7680fb63e77d9dafc0f29be15e53f1ebe6"
+checksum = "db58fcd5a53cf07c184b154801ff91347e4c30d17a3562a635ff028ad5deda46"
 dependencies = [
  "atoi",
  "base64",
@@ -3194,9 +3194,9 @@ dependencies = [
 
 [[package]]
 name = "sqlx-sqlite"
-version = "0.8.5"
+version = "0.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c26083e9a520e8eb87a06b12347679b142dc2ea29e6e409f805644a7a979a5bc"
+checksum = "c2d12fe70b2c1b4401038055f90f151b78208de1f9f89a7dbfd41587a10c3eea"
 dependencies = [
  "atoi",
  "chrono",
diff --git a/common/proto/search.proto b/common/proto/search.proto
index 1ca02ed..25bba8b 100644
--- a/common/proto/search.proto
+++ b/common/proto/search.proto
@@ -10,11 +10,11 @@ message SearchRequest {
   string table_name = 1;
   string query = 2;
 }
-
 message SearchResponse {
   message Hit {
-    int64 id = 1; // The PostgreSQL row ID
+    int64 id = 1; // PostgreSQL row ID
     float score = 2;
+    string content_json = 3;
   }
   repeated Hit hits = 1;
 }
diff --git a/common/src/proto/descriptor.bin b/common/src/proto/descriptor.bin
index 698f17af41ca1cfad39d914f54bb051fa20adae4..642c133a0d3393bcb0da768003e0c1190adcabfc 100644
GIT binary patch
delta 309
zcmaE`mT~_&#to&xOefeUmj|aa&0?HEF_-s&gN|)QjE4d48n|DTwKg7jEq9eJRD+NJX|a+jLZx|EW9j&
ztUw+shzFG8<>BX2;$j2wm>7hZ`8Y(CgeMDzDb(|E@N)5Su|p()3iStmq$a#ct?Y8Ma_
c=i&yr3uLnpixk9JObjd_9uF7KWd3ji0KDHB_5c6?

diff --git a/common/src/proto/multieko2.search.rs b/common/src/proto/multieko2.search.rs
index 7cbb2c4..54d8d47 100644
--- a/common/src/proto/multieko2.search.rs
+++ b/common/src/proto/multieko2.search.rs
@@ -13,13 +13,15 @@ pub struct SearchResponse {
 }
 /// Nested message and enum types in `SearchResponse`.
 pub mod search_response {
-    #[derive(Clone, Copy, PartialEq, ::prost::Message)]
+    #[derive(Clone, PartialEq, ::prost::Message)]
     pub struct Hit {
-        /// The PostgreSQL row ID
+        /// PostgreSQL row ID
         #[prost(int64, tag = "1")]
        pub id: i64,
         #[prost(float, tag = "2")]
         pub score: f32,
+        #[prost(string, tag = "3")]
+        pub content_json: ::prost::alloc::string::String,
     }
 }
 /// Generated client implementations.
diff --git a/search/Cargo.toml b/search/Cargo.toml
index 0273ad7..f769f12 100644
--- a/search/Cargo.toml
+++ b/search/Cargo.toml
@@ -16,3 +16,4 @@
 tantivy = { workspace = true }
 common = { path = "../common" }
 tonic-reflection = "0.13.1"
+sqlx = { version = "0.8.6", features = ["postgres"] }
diff --git a/search/src/lib.rs b/search/src/lib.rs
index 2d7049f..5224233 100644
--- a/search/src/lib.rs
+++ b/search/src/lib.rs
@@ -1,12 +1,13 @@
-// search/src/lib.rs
+// src/lib.rs
+use std::collections::HashMap;
 use std::path::Path;
 
 use tantivy::collector::TopDocs;
 use tantivy::query::{
     BooleanQuery, BoostQuery, FuzzyTermQuery, Occur, Query, QueryParser,
     TermQuery,
 };
-use tantivy::schema::IndexRecordOption;
+use tantivy::schema::{IndexRecordOption, Value};
 use tantivy::{Index, TantivyDocument, Term};
 use tonic::{Request, Response, Status};
 
@@ -16,12 +17,16 @@ use common::proto::multieko2::search::{
 pub use common::proto::multieko2::search::searcher_server::SearcherServer;
 use common::proto::multieko2::search::searcher_server::Searcher;
 use common::search::register_slovak_tokenizers;
-use tantivy::schema::Value;
+use sqlx::{PgPool, Row}; // <-- Import PgPool and Row
 
-pub struct SearcherService;
+// We need to hold the database pool in our service struct.
+pub struct SearcherService {
+    pub pool: PgPool,
+}
 
-// Normalize diacritics in queries
+// Normalize diacritics in queries (no changes here)
 fn normalize_slovak_text(text: &str) -> String {
+    // ... function content is unchanged ...
     text.chars()
         .map(|c| match c {
             'á' | 'à' | 'â' | 'ä' | 'ă' | 'ā' => 'a',
@@ -105,15 +110,17 @@ impl Searcher for SearcherService {
             Status::internal("Schema is missing the 'pg_id' field.")
         })?;
 
+        // --- Query Building Logic (no changes here) ---
+        let prefix_edge_field = schema.get_field("prefix_edge").unwrap();
+        let prefix_full_field = schema.get_field("prefix_full").unwrap();
+        let text_ngram_field = schema.get_field("text_ngram").unwrap();
         let normalized_query = normalize_slovak_text(&query_str);
         let words: Vec<&str> = normalized_query.split_whitespace().collect();
-
         if words.is_empty() {
             return Ok(Response::new(SearchResponse { hits: vec![] }));
         }
-
         let mut query_layers: Vec<(Occur, Box<dyn Query>)> = Vec::new();
-
+        // ... all your query building layers remain exactly the same ...
         // ===============================
         // LAYER 1: PREFIX MATCHING (HIGHEST PRIORITY, Boost: 4.0)
         // ===============================
@@ -190,31 +197,73 @@ impl Searcher for SearcherService {
                 query_layers.push((Occur::Should, Box::new(boosted_query)));
             }
         }
-
         let master_query = BooleanQuery::new(query_layers);
+        // --- End of Query Building Logic ---
 
         let top_docs = searcher
             .search(&master_query, &TopDocs::with_limit(100))
             .map_err(|e| Status::internal(format!("Search failed: {}", e)))?;
 
-        let mut hits = Vec::new();
+        if top_docs.is_empty() {
+            return Ok(Response::new(SearchResponse { hits: vec![] }));
+        }
+
+        // --- NEW LOGIC: Fetch from DB and combine results ---
+
+        // Step 1: Extract (score, pg_id) from Tantivy results.
+        let mut scored_ids: Vec<(f32, u64)> = Vec::new();
         for (score, doc_address) in top_docs {
             let doc: TantivyDocument = searcher.doc(doc_address).map_err(|e| {
                 Status::internal(format!("Failed to retrieve document: {}", e))
             })?;
-
             if let Some(pg_id_value) = doc.get_first(pg_id_field) {
                 if let Some(pg_id) = pg_id_value.as_u64() {
-                    hits.push(Hit {
-                        id: pg_id as i64,
-                        score,
-                    });
+                    scored_ids.push((score, pg_id));
                 }
             }
         }
 
+        // Step 2: Fetch all corresponding rows from Postgres in a single query.
+        let pg_ids: Vec<i64> =
+            scored_ids.iter().map(|(_, id)| *id as i64).collect();
+        let qualified_table = format!("gen.\"{}\"", table_name);
+        let query_str = format!(
+            "SELECT id, to_jsonb(t) AS data FROM {} t WHERE id = ANY($1)",
+            qualified_table
+        );
+
+        let rows = sqlx::query(&query_str)
+            .bind(&pg_ids)
+            .fetch_all(&self.pool)
+            .await
+            .map_err(|e| {
+                Status::internal(format!("Database query failed: {}", e))
+            })?;
+
+        // Step 3: Map the database results by ID for quick lookup.
+        let mut content_map: HashMap<i64, String> = HashMap::new();
+        for row in rows {
+            let id: i64 = row.try_get("id").unwrap_or(0);
+            let json_data: serde_json::Value =
+                row.try_get("data").unwrap_or(serde_json::Value::Null);
+            content_map.insert(id, json_data.to_string());
+        }
+
+        // Step 4: Build the final response, combining Tantivy scores with PG content.
+        let hits: Vec<Hit> = scored_ids
+            .into_iter()
+            .filter_map(|(score, pg_id)| {
+                content_map
+                    .get(&(pg_id as i64))
+                    .map(|content_json| Hit {
+                        id: pg_id as i64,
+                        score,
+                        content_json: content_json.clone(),
+                    })
+            })
+            .collect();
+
         let response = SearchResponse { hits };
         Ok(Response::new(response))
     }
 }
-
diff --git a/server/src/server/run.rs b/server/src/server/run.rs
index 22b1ef8..7390f6e 100644
--- a/server/src/server/run.rs
+++ b/server/src/server/run.rs
@@ -1,4 +1,3 @@
-// src/server/run.rs
 use tonic::transport::Server;
 use tonic_reflection::server::Builder as ReflectionBuilder;
 
@@ -52,7 +51,9 @@ pub async fn run_server(db_pool: sqlx::PgPool) -> Result<(), Box Result<(), Box
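
Editor's note: the final run.rs hunk above is truncated, so the exact wiring change is not visible in this copy of the patch. Purely as an illustration of how the pieces fit together after this change, here is a minimal sketch; the module paths, the helper names, and the omission of the reflection setup are assumptions, not part of the patch. It shows the service being constructed with a PgPool, and a client decoding the new content_json field back into structured JSON.

    // Hypothetical sketch, not part of the patch. Assumes the `search` crate
    // exposes SearcherService as in lib.rs above and that the generated gRPC
    // types live under common::proto::multieko2::search.
    use common::proto::multieko2::search::search_response::Hit;
    use common::proto::multieko2::search::searcher_server::SearcherServer;
    use search::SearcherService;
    use sqlx::PgPool;

    // Server side: the service now owns a Postgres pool so it can hydrate each
    // Tantivy hit with the row serialized by `to_jsonb`.
    fn searcher_service(pool: PgPool) -> SearcherServer<SearcherService> {
        SearcherServer::new(SearcherService { pool })
    }

    // Client side: `content_json` carries the whole row as a JSON object, so it
    // can be parsed into a serde_json::Value (or a concrete struct) for display.
    fn row_from_hit(hit: &Hit) -> Result<serde_json::Value, serde_json::Error> {
        serde_json::from_str(&hit.content_json)
    }

Design-wise, the patch keeps the database work to one round trip: all matching pg_id values are fetched with a single WHERE id = ANY($1) query, and Tantivy scores are joined to the returned rows in memory.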