Slovak language tokenized search
server/Cargo.toml
@@ -31,6 +31,7 @@ bcrypt = "0.17.0"
 validator = { version = "0.20.0", features = ["derive"] }
 uuid = { version = "1.16.0", features = ["serde", "v4"] }
 jsonwebtoken = "9.3.1"
+rust-stemmers = "1.2.0"
 
 [lib]
 name = "server"
server/src/bin/manual_indexer.rs (deleted)
@@ -1,83 +0,0 @@
-// In server/src/bin/manual_indexer.rs
-
-use sqlx::{PgPool, Row};
-use tantivy::schema::*;
-use tantivy::{doc, Index};
-use std::path::Path;
-
-// --- CONFIGURATION ---
-// IMPORTANT: Change this to a table name that actually exists and has data in your test DB.
-// From your grpcurl output, "2025_test_post" is a good candidate.
-const TABLE_TO_INDEX: &str = "2025_test_post2";
-const INDEX_DIR: &str = "./tantivy_indexes";
-
-#[tokio::main]
-async fn main() -> anyhow::Result<()> {
-    // --- Database Connection ---
-    // This assumes you have a .env file with DATABASE_URL
-    dotenvy::dotenv().ok();
-    let database_url = std::env::var("DATABASE_URL")
-        .expect("DATABASE_URL must be set in your .env file");
-    let pool = PgPool::connect(&database_url).await?;
-    println!("Connected to database.");
-
-    // --- Tantivy Schema Definition ---
-    let mut schema_builder = Schema::builder();
-    // This field will store the original Postgres row ID. It's crucial.
-    schema_builder.add_u64_field("pg_id", INDEXED | STORED);
-    // This field will contain ALL text data from the row, concatenated.
-    schema_builder.add_text_field("all_text", TEXT | STORED);
-    let schema = schema_builder.build();
-
-    // --- Index Creation ---
-    let index_path = Path::new(INDEX_DIR).join(TABLE_TO_INDEX);
-    if index_path.exists() {
-        println!("Removing existing index at: {}", index_path.display());
-        std::fs::remove_dir_all(&index_path)?;
-    }
-    std::fs::create_dir_all(&index_path)?;
-    let index = Index::create_in_dir(&index_path, schema.clone())?;
-    let mut index_writer = index.writer(100_000_000)?; // 100MB heap
-
-    println!("Indexing table: {}", TABLE_TO_INDEX);
-
-    // --- Data Fetching and Indexing ---
-    let qualified_table = format!("gen.\"{}\"", TABLE_TO_INDEX);
-    let query_str = format!("SELECT id, to_jsonb(t) AS data FROM {} t", qualified_table);
-    let rows = sqlx::query(&query_str).fetch_all(&pool).await?;
-
-    if rows.is_empty() {
-        println!("Warning: No rows found in table '{}'. Index will be empty.", TABLE_TO_INDEX);
-    }
-
-    let pg_id_field = schema.get_field("pg_id").unwrap();
-    let all_text_field = schema.get_field("all_text").unwrap();
-
-    for row in &rows {
-        let id: i64 = row.try_get("id")?;
-        let data: serde_json::Value = row.try_get("data")?;
-
-        // Concatenate all text values from the JSON into one big string.
-        let mut full_text = String::new();
-        if let Some(obj) = data.as_object() {
-            for value in obj.values() {
-                if let Some(s) = value.as_str() {
-                    full_text.push_str(s);
-                    full_text.push(' ');
-                }
-            }
-        }
-
-        // Add the document to Tantivy
-        index_writer.add_document(doc!(
-            pg_id_field => id as u64,
-            all_text_field => full_text
-        ))?;
-    }
-
-    // --- Finalize ---
-    index_writer.commit()?;
-    println!("Successfully indexed {} documents into '{}'", rows.len(), index_path.display());
-
-    Ok(())
-}
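Note: for context, the deleted binary flattened each row by pulling every string value out of the `to_jsonb(t)` object. A minimal sketch of that flattening, using a hypothetical row (serde_json's default map iterates keys alphabetically, and numeric columns such as `id` were silently skipped, which the new `extract_text_content` below addresses for numbers):

use serde_json::json;

fn main() {
    // Hypothetical shape of one row fetched via `SELECT id, to_jsonb(t) AS data ... t`
    let data = json!({ "body": "Ahoj svet", "id": 7, "title": "Prvý príspevok" });

    // The deleted indexer concatenated only string values, space-separated:
    let mut full_text = String::new();
    if let Some(obj) = data.as_object() {
        for value in obj.values() {
            if let Some(s) = value.as_str() {
                full_text.push_str(s);
                full_text.push(' ');
            }
        }
    }
    assert_eq!(full_text, "Ahoj svet Prvý príspevok "); // "id": 7 is dropped
}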
server/src/indexer.rs
@@ -2,10 +2,11 @@
 
 use std::path::Path;
 use sqlx::{PgPool, Row};
-use tantivy::schema::{Schema, Term, TEXT, STORED, INDEXED};
+use tantivy::schema::{Schema, Term};
 use tantivy::{doc, Index, IndexWriter};
 use tokio::sync::mpsc::Receiver;
 use tracing::{error, info, warn};
+use crate::search_schema;
 
 const INDEX_DIR: &str = "./tantivy_indexes";
@@ -25,7 +26,6 @@ pub struct IndexCommandData {
 }
 
 /// The main loop for the background indexer task.
 /// It listens for commands on the receiver and updates the Tantivy index.
 pub async fn indexer_task(pool: PgPool, mut receiver: Receiver<IndexCommand>) {
     info!("Background indexer task started.");
     while let Some(command) = receiver.recv().await {
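Note: the channel plumbing itself is not part of this diff. A rough sketch of how the write path might feed the task, with the `AddOrUpdate` variant name and field visibility assumed rather than taken from this commit:

use sqlx::PgPool;
use tokio::sync::mpsc;
use server::indexer::{indexer_task, IndexCommand, IndexCommandData};

async fn start_indexer(pool: PgPool) -> mpsc::Sender<IndexCommand> {
    // A capacity of 1024 queued commands is an arbitrary choice here.
    let (tx, rx) = mpsc::channel::<IndexCommand>(1024);
    tokio::spawn(indexer_task(pool, rx));
    tx
}

// After a committed INSERT/UPDATE (IndexCommand::AddOrUpdate is an assumed variant name):
async fn notify_indexer(tx: &mpsc::Sender<IndexCommand>, new_row_id: i64) {
    let cmd = IndexCommand::AddOrUpdate(IndexCommandData {
        table_name: "2025_test_post2".to_string(),
        row_id: new_row_id,
    });
    if tx.send(cmd).await.is_err() {
        // Receiver dropped: the background indexer task has shut down.
        tracing::warn!("indexer task is no longer running");
    }
}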
@@ -62,21 +62,13 @@ async fn handle_add_or_update(
         .await?;
     let json_data: serde_json::Value = row.try_get("data")?;
 
-    // 2. Prepare the Tantivy document
-    let mut full_text = String::new();
-    if let Some(obj) = json_data.as_object() {
-        for value in obj.values() {
-            if let Some(s) = value.as_str() {
-                full_text.push_str(s);
-                full_text.push(' ');
-            }
-        }
-    }
+    // 2. Extract all text content for Slovak processing
+    let slovak_text = extract_text_content(&json_data);
 
     // 3. Open the index and write the document
     let (mut writer, schema) = get_index_writer(&data.table_name)?;
     let pg_id_field = schema.get_field("pg_id").unwrap();
-    let all_text_field = schema.get_field("all_text").unwrap();
+    let text_sk_field = schema.get_field("text_sk").unwrap();
 
     // First, delete any existing document with this ID to handle updates
     let id_term = Term::from_field_u64(pg_id_field, data.row_id as u64);
@@ -85,13 +77,13 @@ async fn handle_add_or_update(
     // Add the new document
     writer.add_document(doc!(
         pg_id_field => data.row_id as u64,
-        all_text_field => full_text
+        text_sk_field => slovak_text
     ))?;
 
     // 4. Commit changes
     writer.commit()?;
     info!(
-        "Successfully indexed document id:{} for table:{}",
+        "Successfully indexed Slovak document id:{} for table:{}",
         data.row_id, data.table_name
     );
 
@@ -122,19 +114,32 @@ async fn handle_delete(
 fn get_index_writer(
     table_name: &str,
 ) -> anyhow::Result<(IndexWriter, Schema)> {
-    let index_path = Path::new(INDEX_DIR).join(table_name);
-    std::fs::create_dir_all(&index_path)?;
-
-    let index = Index::open_in_dir(&index_path).or_else(|_| {
-        // If it doesn't exist, create it with the standard schema
-        let mut schema_builder = Schema::builder();
-        schema_builder.add_u64_field("pg_id", INDEXED | STORED);
-        schema_builder.add_text_field("all_text", TEXT | STORED);
-        let schema = schema_builder.build();
-        Index::create_in_dir(&index_path, schema)
-    })?;
-
+    let index = search_schema::get_or_create_index(table_name)?;
     let schema = index.schema();
     let writer = index.writer(100_000_000)?; // 100MB heap
     Ok((writer, schema))
 }
+
+/// Extract all text content from a JSON object for indexing
+fn extract_text_content(json_data: &serde_json::Value) -> String {
+    let mut full_text = String::new();
+
+    if let Some(obj) = json_data.as_object() {
+        for value in obj.values() {
+            match value {
+                serde_json::Value::String(s) => {
+                    full_text.push_str(s);
+                    full_text.push(' ');
+                }
+                serde_json::Value::Number(n) => {
+                    full_text.push_str(&n.to_string());
+                    full_text.push(' ');
+                }
+                // We could recursively handle nested objects if needed
+                _ => {}
+            }
+        }
+    }
+
+    full_text.trim().to_string()
+}
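Note: a quick check of what `extract_text_content` produces; a unit-test sketch that could sit in this same module. serde_json's default map iterates keys alphabetically, so output order follows key order, not insertion order:

#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    #[test]
    fn concatenates_strings_and_numbers() {
        let row = json!({
            "cislo": 42,                 // numbers are indexed now
            "nazov": "Účtovný doklad",   // strings are kept verbatim
            "polozky": ["a", "b"]        // arrays/objects are skipped
        });
        assert_eq!(extract_text_content(&row), "42 Účtovný doklad");
    }
}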
server/src/lib.rs
@@ -2,6 +2,7 @@
 pub mod db;
 pub mod auth;
 pub mod indexer;
+pub mod search_schema;
 pub mod server;
 pub mod adresar;
 pub mod uctovnictvo;
server/src/search_schema.rs (new file, 63 lines)
@@ -0,0 +1,63 @@
+// server/src/search_schema.rs
+
+use tantivy::schema::*;
+use tantivy::tokenizer::*;
+use tantivy::Index;
+use std::path::Path;
+
+/// Creates a Tantivy schema optimized for Slovak ngram search
+pub fn create_search_schema() -> Schema {
+    let mut schema_builder = Schema::builder();
+
+    // ID field to link back to PostgreSQL
+    schema_builder.add_u64_field("pg_id", INDEXED | STORED);
+
+    // Slovak text field with ngram tokenizer for search-as-you-type
+    let text_field_indexing = TextFieldIndexing::default()
+        .set_tokenizer("slovak") // KEEP THE SAME NAME
+        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
+
+    let text_options = TextOptions::default()
+        .set_indexing_options(text_field_indexing)
+        .set_stored();
+
+    schema_builder.add_text_field("text_sk", text_options);
+
+    schema_builder.build()
+}
+
+/// Registers the Slovak ngram tokenizer with the index
+pub fn register_slovak_tokenizer(index: &Index) -> tantivy::Result<()> {
+    let tokenizer_manager = index.tokenizers();
+
+    // Create Slovak ngram tokenizer pipeline - BUT REGISTER AS "slovak"
+    let slovak_ngram_tokenizer = TextAnalyzer::builder(
+        NgramTokenizer::new(3, 3, false)? // min=3, max=3, prefix_only=false
+    )
+    .filter(RemoveLongFilter::limit(40)) // Remove very long tokens
+    .filter(LowerCaser) // Convert to lowercase
+    .filter(AsciiFoldingFilter) // Handle diacritics: č->c, š->s, ž->z, etc.
+    .build();
+
+    tokenizer_manager.register("slovak", slovak_ngram_tokenizer); // SAME NAME
+
+    Ok(())
+}
+
+/// Gets or creates an index for a table with proper Slovak ngram processing
+pub fn get_or_create_index(table_name: &str) -> tantivy::Result<Index> {
+    let index_path = Path::new("./tantivy_indexes").join(table_name);
+    std::fs::create_dir_all(&index_path)?;
+
+    let index = if index_path.join("meta.json").exists() {
+        Index::open_in_dir(&index_path)?
+    } else {
+        let schema = create_search_schema();
+        Index::create_in_dir(&index_path, schema)?
+    };
+
+    // Always register the tokenizer when opening
+    register_slovak_tokenizer(&index)?;
+
+    Ok(index)
+}
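Note: what the read side would look like against this schema, as a sketch only (assumed tantivy 0.22-era API, not part of this commit). Because `get_or_create_index` re-registers the "slovak" analyzer on every open, the query string is run through the same 3-gram, lowercase, ASCII-folding pipeline as the indexed text, so diacritic-insensitive partial input like "skol" can match a stored "Škola". How the parser combines the resulting ngram tokens (phrase vs. boolean) varies with tantivy version, so treat this as a starting point:

use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::TantivyDocument;

use server::search_schema;

fn search(table_name: &str, user_input: &str) -> anyhow::Result<()> {
    // Re-opening also re-registers the "slovak" tokenizer (see above).
    let index = search_schema::get_or_create_index(table_name)?;
    let reader = index.reader()?;
    let searcher = reader.searcher();

    let schema = index.schema();
    let text_sk = schema.get_field("text_sk")?;

    // The parser tokenizes `user_input` with the field's "slovak" analyzer.
    let parser = QueryParser::for_index(&index, vec![text_sk]);
    let query = parser.parse_query(user_input)?;

    for (score, addr) in searcher.search(&query, &TopDocs::with_limit(10))? {
        // Retrieval API differs across tantivy versions; 0.22 style shown.
        // The stored "pg_id" in each hit links back to the Postgres row.
        let doc: TantivyDocument = searcher.doc(addr)?;
        println!("score={score:.2} doc={doc:?}");
    }
    Ok(())
}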