json in the output of the tantivy search

This commit is contained in:
filipriec
2025-06-11 14:07:22 +02:00
parent 495d77fda5
commit afd9228efa
7 changed files with 91 additions and 38 deletions

Cargo.lock (generated)
View File

@@ -2740,6 +2740,7 @@ dependencies = [
"prost",
"serde",
"serde_json",
"sqlx",
"tantivy",
"tokio",
"tonic",
@@ -3019,9 +3020,9 @@ dependencies = [
[[package]]
name = "sqlx"
version = "0.8.5"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f3c3a85280daca669cfd3bcb68a337882a8bc57ec882f72c5d13a430613a738e"
checksum = "1fefb893899429669dcdd979aff487bd78f4064e5e7907e4269081e0ef7d97dc"
dependencies = [
"sqlx-core",
"sqlx-macros",
@@ -3032,9 +3033,9 @@ dependencies = [
[[package]]
name = "sqlx-core"
version = "0.8.5"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f743f2a3cea30a58cd479013f75550e879009e3a02f616f18ca699335aa248c3"
checksum = "ee6798b1838b6a0f69c007c133b8df5866302197e404e8b6ee8ed3e3a5e68dc6"
dependencies = [
"base64",
"bytes",
@@ -3070,9 +3071,9 @@ dependencies = [
[[package]]
name = "sqlx-macros"
version = "0.8.5"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f4200e0fde19834956d4252347c12a083bdcb237d7a1a1446bffd8768417dce"
checksum = "a2d452988ccaacfbf5e0bdbc348fb91d7c8af5bee192173ac3636b5fb6e6715d"
dependencies = [
"proc-macro2",
"quote",
@@ -3083,9 +3084,9 @@ dependencies = [
[[package]]
name = "sqlx-macros-core"
version = "0.8.5"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "882ceaa29cade31beca7129b6beeb05737f44f82dbe2a9806ecea5a7093d00b7"
checksum = "19a9c1841124ac5a61741f96e1d9e2ec77424bf323962dd894bdb93f37d5219b"
dependencies = [
"dotenvy",
"either",
@@ -3102,16 +3103,15 @@ dependencies = [
"sqlx-postgres",
"sqlx-sqlite",
"syn 2.0.100",
"tempfile",
"tokio",
"url",
]
[[package]]
name = "sqlx-mysql"
version = "0.8.5"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0afdd3aa7a629683c2d750c2df343025545087081ab5942593a5288855b1b7a7"
checksum = "aa003f0038df784eb8fecbbac13affe3da23b45194bd57dba231c8f48199c526"
dependencies = [
"atoi",
"base64",
@@ -3154,9 +3154,9 @@ dependencies = [
[[package]]
name = "sqlx-postgres"
version = "0.8.5"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a0bedbe1bbb5e2615ef347a5e9d8cd7680fb63e77d9dafc0f29be15e53f1ebe6"
checksum = "db58fcd5a53cf07c184b154801ff91347e4c30d17a3562a635ff028ad5deda46"
dependencies = [
"atoi",
"base64",
@@ -3194,9 +3194,9 @@ dependencies = [
[[package]]
name = "sqlx-sqlite"
version = "0.8.5"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c26083e9a520e8eb87a06b12347679b142dc2ea29e6e409f805644a7a979a5bc"
checksum = "c2d12fe70b2c1b4401038055f90f151b78208de1f9f89a7dbfd41587a10c3eea"
dependencies = [
"atoi",
"chrono",

View File

@@ -10,11 +10,11 @@ message SearchRequest {
string table_name = 1;
string query = 2;
}
message SearchResponse {
message Hit {
int64 id = 1; // The PostgreSQL row ID
int64 id = 1; // PostgreSQL row ID
float score = 2;
string content_json = 3;
}
repeated Hit hits = 1;
}
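Each Hit now carries the matching PostgreSQL row serialized as JSON in content_json, alongside the Tantivy score. A minimal client-side sketch of consuming that field, assuming the generated module path shown further down; the helper name is illustrative only:

// Hedged sketch: parse Hit.content_json back into a serde_json::Value.
// The Hit path follows the generated proto code below; row_from_hit is
// a hypothetical helper, not part of the commit.
use common::proto::multieko2::search::search_response::Hit;

fn row_from_hit(hit: &Hit) -> Result<serde_json::Value, serde_json::Error> {
    // content_json holds the whole row as one JSON object keyed by column name.
    serde_json::from_str(&hit.content_json)
}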

Binary file not shown.

View File

@@ -13,13 +13,15 @@ pub struct SearchResponse {
}
/// Nested message and enum types in `SearchResponse`.
pub mod search_response {
#[derive(Clone, Copy, PartialEq, ::prost::Message)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct Hit {
/// The PostgreSQL row ID
/// PostgreSQL row ID
#[prost(int64, tag = "1")]
pub id: i64,
#[prost(float, tag = "2")]
pub score: f32,
#[prost(string, tag = "3")]
pub content_json: ::prost::alloc::string::String,
}
}
/// Generated client implementations.

View File

@@ -16,3 +16,4 @@ tantivy = { workspace = true }
common = { path = "../common" }
tonic-reflection = "0.13.1"
sqlx = { version = "0.8.6", features = ["postgres"] }

View File

@@ -1,12 +1,13 @@
// search/src/lib.rs
// src/lib.rs
use std::collections::HashMap;
use std::path::Path;
use tantivy::collector::TopDocs;
use tantivy::query::{
BooleanQuery, BoostQuery, FuzzyTermQuery, Occur, Query, QueryParser,
TermQuery,
};
use tantivy::schema::IndexRecordOption;
use tantivy::schema::{IndexRecordOption, Value};
use tantivy::{Index, TantivyDocument, Term};
use tonic::{Request, Response, Status};
@@ -16,12 +17,16 @@ use common::proto::multieko2::search::{
pub use common::proto::multieko2::search::searcher_server::SearcherServer;
use common::proto::multieko2::search::searcher_server::Searcher;
use common::search::register_slovak_tokenizers;
use tantivy::schema::Value;
use sqlx::{PgPool, Row}; // <-- Import PgPool and Row
pub struct SearcherService;
// We need to hold the database pool in our service struct.
pub struct SearcherService {
pub pool: PgPool,
}
// Normalize diacritics in queries
// Normalize diacritics in queries (no changes here)
fn normalize_slovak_text(text: &str) -> String {
// ... function content is unchanged ...
text.chars()
.map(|c| match c {
'á' | 'à' | 'â' | 'ä' | 'ă' | 'ā' => 'a',
@@ -105,15 +110,17 @@ impl Searcher for SearcherService {
Status::internal("Schema is missing the 'pg_id' field.")
})?;
// --- Query Building Logic (no changes here) ---
let prefix_edge_field = schema.get_field("prefix_edge").unwrap();
let prefix_full_field = schema.get_field("prefix_full").unwrap();
let text_ngram_field = schema.get_field("text_ngram").unwrap();
let normalized_query = normalize_slovak_text(&query_str);
let words: Vec<&str> = normalized_query.split_whitespace().collect();
if words.is_empty() {
return Ok(Response::new(SearchResponse { hits: vec![] }));
}
let mut query_layers: Vec<(Occur, Box<dyn Query>)> = Vec::new();
// ... all your query building layers remain exactly the same ...
// ===============================
// LAYER 1: PREFIX MATCHING (HIGHEST PRIORITY, Boost: 4.0)
// ===============================
@@ -190,31 +197,73 @@ impl Searcher for SearcherService {
query_layers.push((Occur::Should, Box::new(boosted_query)));
}
}
let master_query = BooleanQuery::new(query_layers);
// --- End of Query Building Logic ---
let top_docs = searcher
.search(&master_query, &TopDocs::with_limit(100))
.map_err(|e| Status::internal(format!("Search failed: {}", e)))?;
let mut hits = Vec::new();
if top_docs.is_empty() {
return Ok(Response::new(SearchResponse { hits: vec![] }));
}
// --- NEW LOGIC: Fetch from DB and combine results ---
// Step 1: Extract (score, pg_id) from Tantivy results.
let mut scored_ids: Vec<(f32, u64)> = Vec::new();
for (score, doc_address) in top_docs {
let doc: TantivyDocument = searcher.doc(doc_address).map_err(|e| {
Status::internal(format!("Failed to retrieve document: {}", e))
})?;
if let Some(pg_id_value) = doc.get_first(pg_id_field) {
if let Some(pg_id) = pg_id_value.as_u64() {
hits.push(Hit {
scored_ids.push((score, pg_id));
}
}
}
// Step 2: Fetch all corresponding rows from Postgres in a single query.
let pg_ids: Vec<i64> =
scored_ids.iter().map(|(_, id)| *id as i64).collect();
let qualified_table = format!("gen.\"{}\"", table_name);
let query_str = format!(
"SELECT id, to_jsonb(t) AS data FROM {} t WHERE id = ANY($1)",
qualified_table
);
let rows = sqlx::query(&query_str)
.bind(&pg_ids)
.fetch_all(&self.pool)
.await
.map_err(|e| {
Status::internal(format!("Database query failed: {}", e))
})?;
// Step 3: Map the database results by ID for quick lookup.
let mut content_map: HashMap<i64, String> = HashMap::new();
for row in rows {
let id: i64 = row.try_get("id").unwrap_or(0);
let json_data: serde_json::Value =
row.try_get("data").unwrap_or(serde_json::Value::Null);
content_map.insert(id, json_data.to_string());
}
// Step 4: Build the final response, combining Tantivy scores with PG content.
let hits: Vec<Hit> = scored_ids
.into_iter()
.filter_map(|(score, pg_id)| {
content_map
.get(&(pg_id as i64))
.map(|content_json| Hit {
id: pg_id as i64,
score,
content_json: content_json.clone(),
})
})
.collect();
let response = SearchResponse { hits };
Ok(Response::new(response))
}
}
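The new flow scores documents with Tantivy first, then hydrates the matching rows from Postgres in a single query; to_jsonb(t) folds each row into one JSON object keyed by column name, which is what ends up in Hit.content_json. A minimal sketch of that query shape, mirroring the format! call above; the example table name and columns are made up:

// Hedged sketch of the hydration SQL built per table in the "gen" schema.
// Identifier quoting is kept exactly as in the commit.
fn hydration_sql(table_name: &str) -> String {
    format!(
        "SELECT id, to_jsonb(t) AS data FROM gen.\"{}\" t WHERE id = ANY($1)",
        table_name
    )
}

// hydration_sql("contacts") yields:
//   SELECT id, to_jsonb(t) AS data FROM gen."contacts" t WHERE id = ANY($1)
// and a fetched "data" value might look like {"id": 42, "name": "Example s.r.o."};
// that JSON string is what the service copies into content_json.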

View File

@@ -1,4 +1,3 @@
// src/server/run.rs
use tonic::transport::Server;
use tonic_reflection::server::Builder as ReflectionBuilder;
@@ -52,7 +51,9 @@ pub async fn run_server(db_pool: sqlx::PgPool) -> Result<(), Box<dyn std::error:
};
let table_script_service = TableScriptService { db_pool: db_pool.clone() };
let auth_service = AuthServiceImpl { db_pool: db_pool.clone() };
let search_service = SearcherService;
// MODIFIED: Instantiate SearcherService with the database pool
let search_service = SearcherService { pool: db_pool.clone() };
Server::builder()
.add_service(AdresarServer::new(AdresarService { db_pool: db_pool.clone() }))
@@ -62,7 +63,7 @@ pub async fn run_server(db_pool: sqlx::PgPool) -> Result<(), Box<dyn std::error:
.add_service(TablesDataServer::new(tables_data_service))
.add_service(TableScriptServer::new(table_script_service))
.add_service(AuthServiceServer::new(auth_service))
.add_service(SearcherServer::new(search_service))
.add_service(SearcherServer::new(search_service)) // This now works correctly
.add_service(reflection_service)
.serve(addr)
.await?;