Better search, but it still has some flaws. It at least works, even though it's not perfect. It needs more testing, but I'm pretty happy with it right now, so I'm keeping it this way.
This commit is contained in:
@@ -1,11 +1,11 @@
|
||||
// src/indexer.rs
|
||||
// server/src/indexer.rs
|
||||
|
||||
use std::path::Path;
|
||||
use sqlx::{PgPool, Row};
|
||||
use tantivy::schema::{Schema, Term};
|
||||
use tantivy::{doc, Index, IndexWriter};
|
||||
use tantivy::schema::Term;
|
||||
use tantivy::{doc, IndexWriter};
|
||||
use tokio::sync::mpsc::Receiver;
|
||||
use tracing::{error, info, warn};
|
||||
use tantivy::schema::Schema;
|
||||
use crate::search_schema;
|
||||
|
||||
const INDEX_DIR: &str = "./tantivy_indexes";
|
||||
@@ -49,44 +49,39 @@ async fn handle_add_or_update(
|
||||
pool: &PgPool,
|
||||
data: IndexCommandData,
|
||||
) -> anyhow::Result<()> {
|
||||
// 1. Fetch the full row data from PostgreSQL
|
||||
let qualified_table = format!("gen.\"{}\"", data.table_name);
|
||||
let query_str = format!(
|
||||
"SELECT to_jsonb(t) AS data FROM {} t WHERE id = $1",
|
||||
qualified_table
|
||||
);
|
||||
|
||||
let row = sqlx::query(&query_str)
|
||||
.bind(data.row_id)
|
||||
.fetch_one(pool)
|
||||
.await?;
|
||||
let json_data: serde_json::Value = row.try_get("data")?;
|
||||
|
||||
// 2. Extract all text content for Slovak processing
|
||||
let slovak_text = extract_text_content(&json_data);
|
||||
|
||||
// 3. Open the index and write the document
|
||||
let (mut writer, schema) = get_index_writer(&data.table_name)?;
|
||||
let pg_id_field = schema.get_field("pg_id").unwrap();
|
||||
let text_sk_field = schema.get_field("text_sk").unwrap();
|
||||
let prefix_edge_field = schema.get_field("prefix_edge").unwrap();
|
||||
let prefix_full_field = schema.get_field("prefix_full").unwrap();
|
||||
let text_ngram_field = schema.get_field("text_ngram").unwrap();
|
||||
|
||||
// First, delete any existing document with this ID to handle updates
|
||||
let id_term = Term::from_field_u64(pg_id_field, data.row_id as u64);
|
||||
writer.delete_term(id_term);
|
||||
|
||||
// Add the new document
|
||||
writer.add_document(doc!(
|
||||
pg_id_field => data.row_id as u64,
|
||||
text_sk_field => slovak_text
|
||||
prefix_edge_field => slovak_text.clone(),
|
||||
prefix_full_field => slovak_text.clone(),
|
||||
text_ngram_field => slovak_text
|
||||
))?;
|
||||
|
||||
// 4. Commit changes
|
||||
writer.commit()?;
|
||||
info!(
|
||||
"Successfully indexed Slovak document id:{} for table:{}",
|
||||
"Successfully indexed document id:{} for table:{}",
|
||||
data.row_id, data.table_name
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -123,7 +118,7 @@ fn get_index_writer(
|
||||
/// Extract all text content from a JSON object for indexing
|
||||
fn extract_text_content(json_data: &serde_json::Value) -> String {
|
||||
let mut full_text = String::new();
|
||||
|
||||
|
||||
if let Some(obj) = json_data.as_object() {
|
||||
for value in obj.values() {
|
||||
match value {
|
||||
@@ -135,11 +130,10 @@ fn extract_text_content(json_data: &serde_json::Value) -> String {
|
||||
full_text.push_str(&n.to_string());
|
||||
full_text.push(' ');
|
||||
}
|
||||
// We could recursively handle nested objects if needed
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
full_text.trim().to_string()
|
||||
}
|
||||
|
||||
@@ -5,59 +5,93 @@ use tantivy::tokenizer::*;
|
||||
use tantivy::Index;
|
||||
use std::path::Path;
|
||||
|
||||
/// Creates a Tantivy schema optimized for Slovak ngram search
|
||||
/// Creates a hybrid Slovak search schema with optimized prefix fields.
|
||||
pub fn create_search_schema() -> Schema {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
// ID field to link back to PostgreSQL
|
||||
schema_builder.add_u64_field("pg_id", INDEXED | STORED);
|
||||
|
||||
// Slovak text field with ngram tokenizer for search-as-you-type
|
||||
let text_field_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("slovak") // KEEP THE SAME NAME
|
||||
// FIELD 1: For prefixes (1-15 chars).
|
||||
let short_prefix_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("slovak_prefix_edge")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
|
||||
let text_options = TextOptions::default()
|
||||
.set_indexing_options(text_field_indexing)
|
||||
let short_prefix_options = TextOptions::default()
|
||||
.set_indexing_options(short_prefix_indexing)
|
||||
.set_stored();
|
||||
schema_builder.add_text_field("prefix_edge", short_prefix_options);
|
||||
|
||||
schema_builder.add_text_field("text_sk", text_options);
|
||||
// FIELD 2: For the full word.
|
||||
let full_word_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("slovak_prefix_full")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
let full_word_options = TextOptions::default()
|
||||
.set_indexing_options(full_word_indexing)
|
||||
.set_stored();
|
||||
schema_builder.add_text_field("prefix_full", full_word_options);
|
||||
|
||||
// NGRAM FIELD: For substring matching.
|
||||
let ngram_field_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("slovak_ngram")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
let ngram_options = TextOptions::default()
|
||||
.set_indexing_options(ngram_field_indexing)
|
||||
.set_stored();
|
||||
schema_builder.add_text_field("text_ngram", ngram_options);
|
||||
|
||||
schema_builder.build()
|
||||
}
|
||||
|
||||
/// Registers the Slovak ngram tokenizer with the index
|
||||
/// Registers all necessary Slovak tokenizers with the index.
|
||||
pub fn register_slovak_tokenizer(index: &Index) -> tantivy::Result<()> {
|
||||
let tokenizer_manager = index.tokenizers();
|
||||
|
||||
// Create Slovak ngram tokenizer pipeline - BUT REGISTER AS "slovak"
|
||||
let slovak_ngram_tokenizer = TextAnalyzer::builder(
|
||||
NgramTokenizer::new(3, 3, false)? // min=3, max=3, prefix_only=false
|
||||
)
|
||||
.filter(RemoveLongFilter::limit(40)) // Remove very long tokens
|
||||
.filter(LowerCaser) // Convert to lowercase
|
||||
.filter(AsciiFoldingFilter) // Handle diacritics: č->c, š->s, ž->z, etc.
|
||||
.build();
|
||||
// TOKENIZER for `prefix_edge`: Edge N-gram (1-15 chars)
|
||||
if tokenizer_manager.get("slovak_prefix_edge").is_none() {
|
||||
// YOUR RECOMMENDED FIX: Extend the max_gram to a more practical limit.
|
||||
let tokenizer = TextAnalyzer::builder(
|
||||
NgramTokenizer::new(1, 15, true)?
|
||||
)
|
||||
.filter(RemoveLongFilter::limit(40))
|
||||
.filter(LowerCaser)
|
||||
.filter(AsciiFoldingFilter)
|
||||
.build();
|
||||
tokenizer_manager.register("slovak_prefix_edge", tokenizer);
|
||||
}
|
||||
|
||||
tokenizer_manager.register("slovak", slovak_ngram_tokenizer); // SAME NAME
|
||||
// TOKENIZER for `prefix_full`: Simple word tokenizer
|
||||
if tokenizer_manager.get("slovak_prefix_full").is_none() {
|
||||
let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
|
||||
.filter(RemoveLongFilter::limit(40))
|
||||
.filter(LowerCaser)
|
||||
.filter(AsciiFoldingFilter)
|
||||
.build();
|
||||
tokenizer_manager.register("slovak_prefix_full", tokenizer);
|
||||
}
|
||||
|
||||
// NGRAM TOKENIZER: For substring matching.
|
||||
if tokenizer_manager.get("slovak_ngram").is_none() {
|
||||
let tokenizer = TextAnalyzer::builder(
|
||||
NgramTokenizer::new(3, 3, false)?
|
||||
)
|
||||
.filter(RemoveLongFilter::limit(40))
|
||||
.filter(LowerCaser)
|
||||
.filter(AsciiFoldingFilter)
|
||||
.build();
|
||||
tokenizer_manager.register("slovak_ngram", tokenizer);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Gets or creates an index for a table with proper Slovak ngram processing
|
||||
pub fn get_or_create_index(table_name: &str) -> tantivy::Result<Index> {
|
||||
let index_path = Path::new("./tantivy_indexes").join(table_name);
|
||||
std::fs::create_dir_all(&index_path)?;
|
||||
|
||||
let index = if index_path.join("meta.json").exists() {
|
||||
Index::open_in_dir(&index_path)?
|
||||
} else {
|
||||
let schema = create_search_schema();
|
||||
Index::create_in_dir(&index_path, schema)?
|
||||
};
|
||||
|
||||
// Always register the tokenizer when opening
|
||||
register_slovak_tokenizer(&index)?;
|
||||
|
||||
Ok(index)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user